__filename__ = "newsdaemon.py" __author__ = "Bob Mottram" __license__ = "AGPL3+" __version__ = "1.2.0" __maintainer__ = "Bob Mottram" __email__ = "bob@libreserver.org" __status__ = "Production" __module_group__ = "Web Interface Columns" # Example hashtag logic: # # if moderated and not #imcoxford then block # if #pol and contains "westminster" then add #britpol # if #unwantedtag then block import os import time import datetime import html from shutil import rmtree from subprocess import Popen from collections import OrderedDict from newswire import getDictFromNewswire # from posts import sendSignedJson from posts import createNewsPost from posts import archivePostsForPerson from content import validHashTag from utils import get_base_content_from_post from utils import remove_html from utils import get_full_domain from utils import load_json from utils import save_json from utils import get_status_number from utils import clear_from_post_caches from utils import dangerous_markup from utils import local_actor_url from inbox import storeHashTags from session import createSession def _updateFeedsOutboxIndex(base_dir: str, domain: str, post_id: str) -> None: """Updates the index used for imported RSS feeds """ basePath = base_dir + '/accounts/news@' + domain indexFilename = basePath + '/outbox.index' if os.path.isfile(indexFilename): if post_id not in open(indexFilename).read(): try: with open(indexFilename, 'r+') as feedsFile: content = feedsFile.read() if post_id + '\n' not in content: feedsFile.seek(0, 0) feedsFile.write(post_id + '\n' + content) print('DEBUG: feeds post added to index') except Exception as ex: print('WARN: Failed to write entry to feeds posts index ' + indexFilename + ' ' + str(ex)) else: try: with open(indexFilename, 'w+') as feedsFile: feedsFile.write(post_id + '\n') except OSError: print('EX: unable to write ' + indexFilename) def _saveArrivedTime(base_dir: str, post_filename: str, arrived: str) -> None: """Saves the time when an rss post arrived to a file """ try: with open(post_filename + '.arrived', 'w+') as arrivedFile: arrivedFile.write(arrived) except OSError: print('EX: unable to write ' + post_filename + '.arrived') def _removeControlCharacters(content: str) -> str: """Remove escaped html """ if '&' in content: return html.unescape(content) return content def _hashtagLogicalNot(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ NOT """ if len(tree) != 2: return False if isinstance(tree[1], str): return tree[1] not in hashtags elif isinstance(tree[1], list): return not hashtagRuleResolve(tree[1], hashtags, moderated, content, url) return False def _hashtagLogicalContains(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ Contains """ if len(tree) != 2: return False matchStr = None if isinstance(tree[1], str): matchStr = tree[1] elif isinstance(tree[1], list): matchStr = tree[1][0] if matchStr: if matchStr.startswith('"') and matchStr.endswith('"'): matchStr = matchStr[1:] matchStr = matchStr[:len(matchStr) - 1] matchStrLower = matchStr.lower() contentWithoutTags = content.replace('#' + matchStrLower, '') return matchStrLower in contentWithoutTags return False def _hashtagLogicalFrom(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ FROM """ if len(tree) != 2: return False matchStr = None if isinstance(tree[1], str): matchStr = tree[1] elif isinstance(tree[1], list): matchStr = tree[1][0] if matchStr: if matchStr.startswith('"') and matchStr.endswith('"'): matchStr = matchStr[1:] matchStr = matchStr[:len(matchStr) - 1] return matchStr.lower() in url return False def _hashtagLogicalAnd(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ AND """ if len(tree) < 3: return False for argIndex in range(1, len(tree)): argValue = False if isinstance(tree[argIndex], str): argValue = (tree[argIndex] in hashtags) elif isinstance(tree[argIndex], list): argValue = hashtagRuleResolve(tree[argIndex], hashtags, moderated, content, url) if not argValue: return False return True def _hashtagLogicalOr(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ OR """ if len(tree) < 3: return False for argIndex in range(1, len(tree)): argValue = False if isinstance(tree[argIndex], str): argValue = (tree[argIndex] in hashtags) elif isinstance(tree[argIndex], list): argValue = hashtagRuleResolve(tree[argIndex], hashtags, moderated, content, url) if argValue: return True return False def _hashtagLogicalXor(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """ XOR """ if len(tree) < 3: return False trueCtr = 0 for argIndex in range(1, len(tree)): argValue = False if isinstance(tree[argIndex], str): argValue = (tree[argIndex] in hashtags) elif isinstance(tree[argIndex], list): argValue = hashtagRuleResolve(tree[argIndex], hashtags, moderated, content, url) if argValue: trueCtr += 1 if trueCtr == 1: return True return False def hashtagRuleResolve(tree: [], hashtags: [], moderated: bool, content: str, url: str) -> bool: """Returns whether the tree for a hashtag rule evaluates to true or false """ if not tree: return False if tree[0] == 'not': return _hashtagLogicalNot(tree, hashtags, moderated, content, url) elif tree[0] == 'contains': return _hashtagLogicalContains(tree, hashtags, moderated, content, url) elif tree[0] == 'from': return _hashtagLogicalFrom(tree, hashtags, moderated, content, url) elif tree[0] == 'and': return _hashtagLogicalAnd(tree, hashtags, moderated, content, url) elif tree[0] == 'or': return _hashtagLogicalOr(tree, hashtags, moderated, content, url) elif tree[0] == 'xor': return _hashtagLogicalXor(tree, hashtags, moderated, content, url) elif tree[0].startswith('#') and len(tree) == 1: return tree[0] in hashtags elif tree[0].startswith('moderated'): return moderated elif tree[0].startswith('"') and tree[0].endswith('"'): return True return False def hashtagRuleTree(operators: [], conditionsStr: str, tagsInConditions: [], moderated: bool) -> []: """Walks the tree """ if not operators and conditionsStr: conditionsStr = conditionsStr.strip() isStr = conditionsStr.startswith('"') and conditionsStr.endswith('"') if conditionsStr.startswith('#') or isStr or \ conditionsStr in operators or \ conditionsStr == 'moderated' or \ conditionsStr == 'contains': if conditionsStr.startswith('#'): if conditionsStr not in tagsInConditions: if ' ' not in conditionsStr or \ conditionsStr.startswith('"'): tagsInConditions.append(conditionsStr) return [conditionsStr.strip()] else: return None if not operators or not conditionsStr: return None tree = None conditionsStr = conditionsStr.strip() isStr = conditionsStr.startswith('"') and conditionsStr.endswith('"') if conditionsStr.startswith('#') or isStr or \ conditionsStr in operators or \ conditionsStr == 'moderated' or \ conditionsStr == 'contains': if conditionsStr.startswith('#'): if conditionsStr not in tagsInConditions: if ' ' not in conditionsStr or \ conditionsStr.startswith('"'): tagsInConditions.append(conditionsStr) tree = [conditionsStr.strip()] ctr = 0 while ctr < len(operators): op = operators[ctr] opMatch = ' ' + op + ' ' if opMatch not in conditionsStr and \ not conditionsStr.startswith(op + ' '): ctr += 1 continue else: tree = [op] if opMatch in conditionsStr: sections = conditionsStr.split(opMatch) else: sections = conditionsStr.split(op + ' ', 1) for subConditionStr in sections: result = hashtagRuleTree(operators[ctr + 1:], subConditionStr, tagsInConditions, moderated) if result: tree.append(result) break return tree def _hashtagAdd(base_dir: str, http_prefix: str, domain_full: str, post_json_object: {}, actionStr: str, hashtags: [], system_language: str, translate: {}) -> None: """Adds a hashtag via a hashtag rule """ addHashtag = actionStr.split('add ', 1)[1].strip() if not addHashtag.startswith('#'): return if addHashtag not in hashtags: hashtags.append(addHashtag) htId = addHashtag.replace('#', '') if not validHashTag(htId): return hashtagUrl = http_prefix + "://" + domain_full + "/tags/" + htId newTag = { 'href': hashtagUrl, 'name': addHashtag, 'type': 'Hashtag' } # does the tag already exist? addTagObject = None for t in post_json_object['object']['tag']: if t.get('type') and t.get('name'): if t['type'] == 'Hashtag' and \ t['name'] == addHashtag: addTagObject = t break # append the tag if it wasn't found if not addTagObject: post_json_object['object']['tag'].append(newTag) # add corresponding html to the post content hashtagHtml = \ " #" + htId + "" content = get_base_content_from_post(post_json_object, system_language) if hashtagHtml in content: return if content.endswith('

'): content = \ content[:len(content) - len('

')] + \ hashtagHtml + '

' else: content += hashtagHtml post_json_object['object']['content'] = content domain = domain_full if ':' in domain: domain = domain.split(':')[0] storeHashTags(base_dir, 'news', domain, http_prefix, domain_full, post_json_object, translate) def _hashtagRemove(http_prefix: str, domain_full: str, post_json_object: {}, actionStr: str, hashtags: [], system_language: str) -> None: """Removes a hashtag via a hashtag rule """ rmHashtag = actionStr.split('remove ', 1)[1].strip() if not rmHashtag.startswith('#'): return if rmHashtag in hashtags: hashtags.remove(rmHashtag) htId = rmHashtag.replace('#', '') hashtagUrl = http_prefix + "://" + domain_full + "/tags/" + htId # remove tag html from the post content hashtagHtml = \ "#" + htId + "" content = get_base_content_from_post(post_json_object, system_language) if hashtagHtml in content: content = content.replace(hashtagHtml, '').replace(' ', ' ') post_json_object['object']['content'] = content post_json_object['object']['contentMap'][system_language] = content rmTagObject = None for t in post_json_object['object']['tag']: if t.get('type') and t.get('name'): if t['type'] == 'Hashtag' and \ t['name'] == rmHashtag: rmTagObject = t break if rmTagObject: post_json_object['object']['tag'].remove(rmTagObject) def _newswireHashtagProcessing(session, base_dir: str, post_json_object: {}, hashtags: [], http_prefix: str, domain: str, port: int, person_cache: {}, cached_webfingers: {}, federation_list: [], send_threads: [], postLog: [], moderated: bool, url: str, system_language: str, translate: {}) -> bool: """Applies hashtag rules to a news post. Returns true if the post should be saved to the news timeline of this instance """ rulesFilename = base_dir + '/accounts/hashtagrules.txt' if not os.path.isfile(rulesFilename): return True rules = [] with open(rulesFilename, 'r') as f: rules = f.readlines() domain_full = get_full_domain(domain, port) # get the full text content of the post content = '' if post_json_object['object'].get('content'): content += get_base_content_from_post(post_json_object, system_language) if post_json_object['object'].get('summary'): content += ' ' + post_json_object['object']['summary'] content = content.lower() # actionOccurred = False operators = ('not', 'and', 'or', 'xor', 'from', 'contains') for ruleStr in rules: if not ruleStr: continue if not ruleStr.startswith('if '): continue if ' then ' not in ruleStr: continue conditionsStr = ruleStr.split('if ', 1)[1] conditionsStr = conditionsStr.split(' then ')[0] tagsInConditions = [] tree = hashtagRuleTree(operators, conditionsStr, tagsInConditions, moderated) if not hashtagRuleResolve(tree, hashtags, moderated, content, url): continue # the condition matches, so do something actionStr = ruleStr.split(' then ')[1].strip() if actionStr.startswith('add '): # add a hashtag _hashtagAdd(base_dir, http_prefix, domain_full, post_json_object, actionStr, hashtags, system_language, translate) elif actionStr.startswith('remove '): # remove a hashtag _hashtagRemove(http_prefix, domain_full, post_json_object, actionStr, hashtags, system_language) elif actionStr.startswith('block') or actionStr.startswith('drop'): # Block this item return False return True def _createNewsMirror(base_dir: str, domain: str, post_idNumber: str, url: str, max_mirrored_articles: int) -> bool: """Creates a local mirror of a news article """ if '|' in url or '>' in url: return True mirrorDir = base_dir + '/accounts/newsmirror' if not os.path.isdir(mirrorDir): os.mkdir(mirrorDir) # count the directories noOfDirs = 0 for subdir, dirs, files in os.walk(mirrorDir): noOfDirs = len(dirs) mirrorIndexFilename = base_dir + '/accounts/newsmirror.txt' if max_mirrored_articles > 0 and noOfDirs > max_mirrored_articles: if not os.path.isfile(mirrorIndexFilename): # no index for mirrors found return True removals = [] with open(mirrorIndexFilename, 'r') as indexFile: # remove the oldest directories ctr = 0 while noOfDirs > max_mirrored_articles: ctr += 1 if ctr > 5000: # escape valve break post_id = indexFile.readline() if not post_id: continue post_id = post_id.strip() mirrorArticleDir = mirrorDir + '/' + post_id if os.path.isdir(mirrorArticleDir): rmtree(mirrorArticleDir, ignore_errors=False, onerror=None) removals.append(post_id) noOfDirs -= 1 # remove the corresponding index entries if removals: indexContent = '' with open(mirrorIndexFilename, 'r') as indexFile: indexContent = indexFile.read() for removePostId in removals: indexContent = \ indexContent.replace(removePostId + '\n', '') try: with open(mirrorIndexFilename, 'w+') as indexFile: indexFile.write(indexContent) except OSError: print('EX: unable to write ' + mirrorIndexFilename) mirrorArticleDir = mirrorDir + '/' + post_idNumber if os.path.isdir(mirrorArticleDir): # already mirrored return True # for onion instances mirror via tor prefixStr = '' if domain.endswith('.onion'): prefixStr = '/usr/bin/torsocks ' # download the files commandStr = \ prefixStr + '/usr/bin/wget -mkEpnp -e robots=off ' + url + \ ' -P ' + mirrorArticleDir p = Popen(commandStr, shell=True) os.waitpid(p.pid, 0) if not os.path.isdir(mirrorArticleDir): print('WARN: failed to mirror ' + url) return True # append the post Id number to the index file if os.path.isfile(mirrorIndexFilename): try: with open(mirrorIndexFilename, 'a+') as indexFile: indexFile.write(post_idNumber + '\n') except OSError: print('EX: unable to append ' + mirrorIndexFilename) else: try: with open(mirrorIndexFilename, 'w+') as indexFile: indexFile.write(post_idNumber + '\n') except OSError: print('EX: unable to write ' + mirrorIndexFilename) return True def _convertRSStoActivityPub(base_dir: str, http_prefix: str, domain: str, port: int, newswire: {}, translate: {}, recent_posts_cache: {}, max_recent_posts: int, session, cached_webfingers: {}, person_cache: {}, federation_list: [], send_threads: [], postLog: [], max_mirrored_articles: int, allow_local_network_access: bool, system_language: str, low_bandwidth: bool, content_license_url: str) -> None: """Converts rss items in a newswire into posts """ if not newswire: print('No newswire to convert') return basePath = base_dir + '/accounts/news@' + domain + '/outbox' if not os.path.isdir(basePath): os.mkdir(basePath) # oldest items first newswireReverse = OrderedDict(sorted(newswire.items(), reverse=False)) for dateStr, item in newswireReverse.items(): originalDateStr = dateStr # convert the date to the format used by ActivityPub if '+00:00' in dateStr: dateStr = dateStr.replace(' ', 'T') dateStr = dateStr.replace('+00:00', 'Z') else: try: dateStrWithOffset = \ datetime.datetime.strptime(dateStr, "%Y-%m-%d %H:%M:%S%z") except BaseException: print('EX: Newswire strptime failed ' + str(dateStr)) continue try: dateStr = dateStrWithOffset.strftime("%Y-%m-%dT%H:%M:%SZ") except BaseException: print('EX: Newswire dateStrWithOffset failed ' + str(dateStrWithOffset)) continue statusNumber, published = get_status_number(dateStr) newPostId = \ local_actor_url(http_prefix, 'news', domain) + \ '/statuses/' + statusNumber # file where the post is stored filename = basePath + '/' + newPostId.replace('/', '#') + '.json' if os.path.isfile(filename): # don't create the post if it already exists # set the url # newswire[originalDateStr][1] = \ # '/users/news/statuses/' + statusNumber # set the filename newswire[originalDateStr][3] = filename continue rssTitle = _removeControlCharacters(item[0]) url = item[1] if dangerous_markup(url, allow_local_network_access) or \ dangerous_markup(rssTitle, allow_local_network_access): continue rssDescription = '' # get the rss description if it exists rssDescription = '

' + remove_html(item[4]) + '

' mirrored = item[7] postUrl = url if mirrored and '://' in url: postUrl = '/newsmirror/' + statusNumber + '/' + \ url.split('://')[1] if postUrl.endswith('/'): postUrl += 'index.html' else: postUrl += '/index.html' # add the off-site link to the description rssDescription += \ '
' + \ translate['Read more...'] + '' followersOnly = False # NOTE: the id when the post is created will not be # consistent (it's based on the current time, not the # published time), so we change that later saveToFile = False attachImageFilename = None mediaType = None imageDescription = None city = 'London, England' conversationId = None blog = createNewsPost(base_dir, domain, port, http_prefix, rssDescription, followersOnly, saveToFile, attachImageFilename, mediaType, imageDescription, city, rssTitle, system_language, conversationId, low_bandwidth, content_license_url) if not blog: continue if mirrored: if not _createNewsMirror(base_dir, domain, statusNumber, url, max_mirrored_articles): continue idStr = \ local_actor_url(http_prefix, 'news', domain) + \ '/statuses/' + statusNumber + '/replies' blog['news'] = True # note the time of arrival curr_time = datetime.datetime.utcnow() blog['object']['arrived'] = curr_time.strftime("%Y-%m-%dT%H:%M:%SZ") # change the id, based upon the published time blog['object']['replies']['id'] = idStr blog['object']['replies']['first']['partOf'] = idStr blog['id'] = newPostId + '/activity' blog['object']['id'] = newPostId blog['object']['atomUri'] = newPostId blog['object']['url'] = \ http_prefix + '://' + domain + '/@news/' + statusNumber blog['object']['published'] = dateStr blog['object']['content'] = rssDescription blog['object']['contentMap'][system_language] = rssDescription domain_full = get_full_domain(domain, port) hashtags = item[6] post_id = newPostId.replace('/', '#') moderated = item[5] savePost = _newswireHashtagProcessing(session, base_dir, blog, hashtags, http_prefix, domain, port, person_cache, cached_webfingers, federation_list, send_threads, postLog, moderated, url, system_language, translate) # save the post and update the index if savePost: # ensure that all hashtags are stored in the json # and appended to the content blog['object']['tag'] = [] for tagName in hashtags: htId = tagName.replace('#', '') hashtagUrl = \ http_prefix + "://" + domain_full + "/tags/" + htId newTag = { 'href': hashtagUrl, 'name': tagName, 'type': 'Hashtag' } blog['object']['tag'].append(newTag) hashtagHtml = \ " #" + \ htId + "" content = get_base_content_from_post(blog, system_language) if hashtagHtml not in content: if content.endswith('

'): content = \ content[:len(content) - len('

')] + \ hashtagHtml + '

' else: content += hashtagHtml blog['object']['content'] = content blog['object']['contentMap'][system_language] = content # update the newswire tags if new ones have been found by # _newswireHashtagProcessing for tag in hashtags: if tag not in newswire[originalDateStr][6]: newswire[originalDateStr][6].append(tag) storeHashTags(base_dir, 'news', domain, http_prefix, domain_full, blog, translate) clear_from_post_caches(base_dir, recent_posts_cache, post_id) if save_json(blog, filename): _updateFeedsOutboxIndex(base_dir, domain, post_id + '.json') # Save a file containing the time when the post arrived # this can then later be used to construct the news timeline # excluding items during the voting period if moderated: _saveArrivedTime(base_dir, filename, blog['object']['arrived']) else: if os.path.isfile(filename + '.arrived'): try: os.remove(filename + '.arrived') except OSError: print('EX: _convertRSStoActivityPub ' + 'unable to delete ' + filename + '.arrived') # setting the url here links to the activitypub object # stored locally # newswire[originalDateStr][1] = \ # '/users/news/statuses/' + statusNumber # set the filename newswire[originalDateStr][3] = filename def _mergeWithPreviousNewswire(oldNewswire: {}, newNewswire: {}) -> None: """Preserve any votes or generated activitypub post filename as rss feeds are updated """ if not oldNewswire: return for published, fields in oldNewswire.items(): if not newNewswire.get(published): continue for i in range(1, 5): newNewswire[published][i] = fields[i] def runNewswireDaemon(base_dir: str, httpd, http_prefix: str, domain: str, port: int, translate: {}) -> None: """Periodically updates RSS feeds """ newswireStateFilename = base_dir + '/accounts/.newswirestate.json' refreshFilename = base_dir + '/accounts/.refresh_newswire' # initial sleep to allow the system to start up time.sleep(50) while True: # has the session been created yet? if not httpd.session: print('Newswire daemon waiting for session') httpd.session = createSession(httpd.proxy_type) if not httpd.session: print('Newswire daemon has no session') time.sleep(60) continue else: print('Newswire daemon session established') # try to update the feeds print('Updating newswire feeds') newNewswire = \ getDictFromNewswire(httpd.session, base_dir, domain, httpd.max_newswire_postsPerSource, httpd.max_newswire_feed_size_kb, httpd.maxTags, httpd.max_feed_item_size_kb, httpd.max_newswire_posts, httpd.maxCategoriesFeedItemSizeKb, httpd.system_language, httpd.debug) if not httpd.newswire: print('Newswire feeds not updated') if os.path.isfile(newswireStateFilename): print('Loading newswire from file') httpd.newswire = load_json(newswireStateFilename) print('Merging with previous newswire') _mergeWithPreviousNewswire(httpd.newswire, newNewswire) httpd.newswire = newNewswire if newNewswire: save_json(httpd.newswire, newswireStateFilename) print('Newswire updated') else: print('No new newswire') print('Converting newswire to activitypub format') _convertRSStoActivityPub(base_dir, http_prefix, domain, port, newNewswire, translate, httpd.recent_posts_cache, httpd.max_recent_posts, httpd.session, httpd.cached_webfingers, httpd.person_cache, httpd.federation_list, httpd.send_threads, httpd.postLog, httpd.max_mirrored_articles, httpd.allow_local_network_access, httpd.system_language, httpd.low_bandwidth, httpd.content_license_url) print('Newswire feed converted to ActivityPub') if httpd.max_news_posts > 0: archive_dir = base_dir + '/archive' archiveSubdir = \ archive_dir + '/accounts/news@' + domain + '/outbox' print('Archiving news posts') archivePostsForPerson(http_prefix, 'news', domain, base_dir, 'outbox', archiveSubdir, httpd.recent_posts_cache, httpd.max_news_posts) # wait a while before the next feeds update for tick in range(120): time.sleep(10) # if a new blog post has been created then stop # waiting and recalculate the newswire if os.path.isfile(refreshFilename): try: os.remove(refreshFilename) except OSError: print('EX: runNewswireDaemon unable to delete ' + str(refreshFilename)) break def runNewswireWatchdog(project_version: str, httpd) -> None: """This tries to keep the newswire update thread running even if it dies """ print('Starting newswire watchdog') newswireOriginal = \ httpd.thrPostSchedule.clone(runNewswireDaemon) httpd.thrNewswireDaemon.start() while True: time.sleep(50) if httpd.thrNewswireDaemon.is_alive(): continue httpd.thrNewswireDaemon.kill() httpd.thrNewswireDaemon = \ newswireOriginal.clone(runNewswireDaemon) httpd.thrNewswireDaemon.start() print('Restarting newswire daemon...')