From ee9d9a9dc52662f08d0fd30dcab46d6e263fd3c4 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Mon, 19 Oct 2020 20:26:58 +0100 Subject: [PATCH] Ability to mirror rss feed content --- blocking.py | 10 ++++--- newsdaemon.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/blocking.py b/blocking.py index b82cf3e0..e2493395 100644 --- a/blocking.py +++ b/blocking.py @@ -28,8 +28,9 @@ def addGlobalBlock(baseDir: str, return False # block an account handle or domain blockFile = open(blockingFilename, "a+") - blockFile.write(blockHandle + '\n') - blockFile.close() + if blockFile: + blockFile.write(blockHandle + '\n') + blockFile.close() else: blockHashtag = blockNickname # is the hashtag already blocked? @@ -38,8 +39,9 @@ def addGlobalBlock(baseDir: str, return False # block a hashtag blockFile = open(blockingFilename, "a+") - blockFile.write(blockHashtag + '\n') - blockFile.close() + if blockFile: + blockFile.write(blockHashtag + '\n') + blockFile.close() return True diff --git a/newsdaemon.py b/newsdaemon.py index 7dd232f9..8caf2761 100644 --- a/newsdaemon.py +++ b/newsdaemon.py @@ -15,6 +15,8 @@ __status__ = "Production" import os import time import datetime +from shutil import rmtree +from subprocess import Popen from collections import OrderedDict from newswire import getDictFromNewswire # from posts import sendSignedJson @@ -348,14 +350,86 @@ def newswireHashtagProcessing(session, baseDir: str, postJsonObject: {}, return True -def createNewsMirror(baseDir: str, url: str, +def createNewsMirror(baseDir: str, postIdNumber: str, url: str, maxMirroredArticles: int) -> bool: """Creates a local mirror of a news article """ + if '|' in url or '>' in url: + return True + mirrorDir = baseDir + '/accounts/newsmirror' if not os.path.isdir(mirrorDir): os.mkdir(mirrorDir) + # count the directories + noOfDirs = 0 + for subdir, dirs, files in os.walk(mirrorDir): + noOfDirs = len(dirs) + + mirrorIndexFilename = baseDir + '/accounts/newsmirror.txt' + + if maxMirroredArticles > 0 and noOfDirs > maxMirroredArticles: + if not os.path.isfile(mirrorIndexFilename): + # no index for mirrors found + return True + removals = [] + with open(mirrorIndexFilename, 'r') as indexFile: + # remove the oldest directories + ctr = 0 + while noOfDirs > maxMirroredArticles: + ctr += 1 + if ctr > 5000: + # escape valve + break + + postId = indexFile.readline() + if not postId: + continue + postId = postId.strip() + mirrorArticleDir = mirrorDir + '/' + postId + if os.path.isdir(mirrorArticleDir): + rmtree(mirrorArticleDir) + removals.append(postId) + noOfDirs -= 1 + + # remove the corresponding index entries + if removals: + indexContent = '' + with open(mirrorIndexFilename, 'r') as indexFile: + indexContent = indexFile.read() + for removePostId in removals: + indexContent = \ + indexContent.replace(removePostId + '\n', '') + with open(mirrorIndexFilename, "w+") as indexFile: + indexFile.write(indexContent) + + mirrorArticleDir = mirrorDir + '/' + postIdNumber + if os.path.isdir(mirrorArticleDir): + # already mirrored + return True + + # download the files + commandStr = \ + '/usr/bin/wget -mkEpnp -e robots=off ' + url + \ + ' -P ' + mirrorArticleDir + p = Popen(commandStr, shell=True) + os.waitpid(p.pid, 0) + + if not os.path.isdir(mirrorArticleDir): + return True + + # append the post Id number to the index file + if os.path.isfile(mirrorIndexFilename): + indexFile = open(mirrorIndexFilename, "a+") + if indexFile: + indexFile.write(postIdNumber + '\n') + indexFile.close() + else: + indexFile = open(mirrorIndexFilename, "w+") + if indexFile: + indexFile.write(postIdNumber + '\n') + indexFile.close() + return True @@ -445,7 +519,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str, mirrored = item[7] if mirrored: - if not createNewsMirror(baseDir, url, maxMirroredArticles): + if not createNewsMirror(baseDir, statusNumber, + url, maxMirroredArticles): continue idStr = \