| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __filename__ = "newswire.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2021-01-26 10:07:42 +00:00
										 |  |  | __version__ = "1.2.0" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-26 11:27:14 +00:00
										 |  |  | __module_group__ = "Web Interface Columns" | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | import os | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  | import json | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | import requests | 
					
						
							|  |  |  | from socket import error as SocketError | 
					
						
							|  |  |  | import errno | 
					
						
							|  |  |  | from datetime import datetime | 
					
						
							| 
									
										
										
										
											2020-11-22 15:33:11 +00:00
										 |  |  | from datetime import timedelta | 
					
						
							| 
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 |  |  | from datetime import timezone | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | from collections import OrderedDict | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  | from utils import validPostDate | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | from categories import setHashtagCategory | 
					
						
							| 
									
										
										
										
											2021-07-20 13:33:27 +00:00
										 |  |  | from utils import getBaseContentFromPost | 
					
						
							| 
									
										
										
										
											2021-06-22 15:45:59 +00:00
										 |  |  | from utils import hasObjectDict | 
					
						
							| 
									
										
										
										
											2020-11-08 10:45:33 +00:00
										 |  |  | from utils import firstParagraphFromString | 
					
						
							| 
									
										
										
										
											2020-10-25 10:42:38 +00:00
										 |  |  | from utils import isPublicPost | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | from utils import locatePost | 
					
						
							|  |  |  | from utils import loadJson | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  | from utils import saveJson | 
					
						
							| 
									
										
										
										
											2020-10-06 08:58:44 +00:00
										 |  |  | from utils import isSuspended | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  | from utils import containsInvalidChars | 
					
						
							| 
									
										
										
										
											2020-10-25 12:47:16 +00:00
										 |  |  | from utils import removeHtml | 
					
						
							| 
									
										
										
										
											2021-07-04 17:55:29 +00:00
										 |  |  | from utils import isAccountDir | 
					
						
							| 
									
										
										
										
											2021-07-13 21:59:53 +00:00
										 |  |  | from utils import acctDir | 
					
						
							| 
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 |  |  | from utils import localActorUrl | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  | from blocking import isBlockedDomain | 
					
						
							| 
									
										
										
										
											2020-10-17 14:23:35 +00:00
										 |  |  | from blocking import isBlockedHashtag | 
					
						
							| 
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 |  |  | from filters import isFiltered | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-16 12:11:05 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _removeCDATA(text: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:18:43 +00:00
										 |  |  |     """Removes any CDATA from the given text
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if 'CDATA[' in text: | 
					
						
							|  |  |  |         text = text.split('CDATA[')[1] | 
					
						
							|  |  |  |         if ']' in text: | 
					
						
							|  |  |  |             text = text.split(']')[0] | 
					
						
							|  |  |  |     return text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  | def rss2Header(httpPrefix: str, | 
					
						
							|  |  |  |                nickname: str, domainFull: str, | 
					
						
							|  |  |  |                title: str, translate: {}) -> str: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 |  |  |     """Header for an RSS 2.0 feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |     rssStr = \ | 
					
						
							|  |  |  |         "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + \ | 
					
						
							|  |  |  |         "<rss version=\"2.0\">" + \ | 
					
						
							|  |  |  |         '<channel>' | 
					
						
							| 
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     if title.startswith('News'): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |         rssStr += \ | 
					
						
							|  |  |  |             '    <title>Newswire</title>' + \ | 
					
						
							|  |  |  |             '    <link>' + httpPrefix + '://' + domainFull + \ | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |             '/newswire.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2020-10-13 17:14:57 +00:00
										 |  |  |     elif title.startswith('Site'): | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |         rssStr += \ | 
					
						
							|  |  |  |             '    <title>' + domainFull + '</title>' + \ | 
					
						
							|  |  |  |             '    <link>' + httpPrefix + '://' + domainFull + \ | 
					
						
							| 
									
										
										
										
											2020-10-13 17:17:17 +00:00
										 |  |  |             '/blog/rss.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     else: | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |         rssStr += \ | 
					
						
							|  |  |  |             '    <title>' + translate[title] + '</title>' + \ | 
					
						
							| 
									
										
										
										
											2021-08-14 11:13:39 +00:00
										 |  |  |             '    <link>' + \ | 
					
						
							|  |  |  |             localActorUrl(httpPrefix, nickname, domainFull) + \ | 
					
						
							|  |  |  |             '/rss.xml' + '</link>' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     return rssStr | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def rss2Footer() -> str: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:22:23 +00:00
										 |  |  |     """Footer for an RSS 2.0 feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-07-04 09:50:09 +00:00
										 |  |  |     rssStr = '</channel></rss>' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     return rssStr | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-23 14:41:29 +00:00
										 |  |  | def getNewswireTags(text: str, maxTags: int) -> []: | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     """Returns a list of hashtags found in the given text
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-16 20:46:34 +00:00
										 |  |  |     if '#' not in text: | 
					
						
							|  |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     if ' ' not in text: | 
					
						
							|  |  |  |         return [] | 
					
						
							|  |  |  |     textSimplified = \ | 
					
						
							|  |  |  |         text.replace(',', ' ').replace(';', ' ').replace('- ', ' ') | 
					
						
							|  |  |  |     textSimplified = textSimplified.replace('. ', ' ').strip() | 
					
						
							|  |  |  |     if textSimplified.endswith('.'): | 
					
						
							|  |  |  |         textSimplified = textSimplified[:len(textSimplified)-1] | 
					
						
							|  |  |  |     words = textSimplified.split(' ') | 
					
						
							|  |  |  |     tags = [] | 
					
						
							|  |  |  |     for wrd in words: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |         if not wrd.startswith('#'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(wrd) <= 1: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if wrd in tags: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         tags.append(wrd) | 
					
						
							|  |  |  |         if len(tags) >= maxTags: | 
					
						
							|  |  |  |             break | 
					
						
							| 
									
										
										
										
											2020-10-16 19:49:34 +00:00
										 |  |  |     return tags | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  | def limitWordLengths(text: str, maxWordLength: int) -> str: | 
					
						
							|  |  |  |     """Limits the maximum length of words so that the newswire
 | 
					
						
							|  |  |  |     column cannot become too wide | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if ' ' not in text: | 
					
						
							|  |  |  |         return text | 
					
						
							|  |  |  |     words = text.split(' ') | 
					
						
							|  |  |  |     result = '' | 
					
						
							|  |  |  |     for wrd in words: | 
					
						
							|  |  |  |         if len(wrd) > maxWordLength: | 
					
						
							|  |  |  |             wrd = wrd[:maxWordLength] | 
					
						
							|  |  |  |         if result: | 
					
						
							|  |  |  |             result += ' ' | 
					
						
							|  |  |  |         result += wrd | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _addNewswireDictEntry(baseDir: str, domain: str, | 
					
						
							|  |  |  |                           newswire: {}, dateStr: str, | 
					
						
							|  |  |  |                           title: str, link: str, | 
					
						
							|  |  |  |                           votesStatus: str, postFilename: str, | 
					
						
							|  |  |  |                           description: str, moderated: bool, | 
					
						
							|  |  |  |                           mirrored: bool, | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |                           tags: [] = [], | 
					
						
							|  |  |  |                           maxTags: int = 32) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 |  |  |     """Update the newswire dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-12 15:44:43 +00:00
										 |  |  |     # remove any markup | 
					
						
							|  |  |  |     title = removeHtml(title) | 
					
						
							|  |  |  |     description = removeHtml(description) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     allText = title + ' ' + description | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # check that none of the text is filtered against | 
					
						
							| 
									
										
										
										
											2020-12-19 11:43:20 +00:00
										 |  |  |     if isFiltered(baseDir, None, None, allText): | 
					
						
							| 
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 |  |  |         return | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |     title = limitWordLengths(title, 13) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |     if tags is None: | 
					
						
							|  |  |  |         tags = [] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # extract hashtags from the text of the feed post | 
					
						
							|  |  |  |     postTags = getNewswireTags(allText, maxTags) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # combine the tags into a single list | 
					
						
							| 
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 |  |  |     for tag in tags: | 
					
						
							| 
									
										
										
										
											2021-07-07 14:00:53 +00:00
										 |  |  |         if tag in postTags: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(postTags) < maxTags: | 
					
						
							|  |  |  |             postTags.append(tag) | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # check that no tags are blocked | 
					
						
							| 
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 |  |  |     for tag in postTags: | 
					
						
							| 
									
										
										
										
											2020-12-03 19:51:47 +00:00
										 |  |  |         if isBlockedHashtag(baseDir, tag): | 
					
						
							| 
									
										
										
										
											2020-10-25 10:18:07 +00:00
										 |  |  |             return | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     newswire[dateStr] = [ | 
					
						
							|  |  |  |         title, | 
					
						
							|  |  |  |         link, | 
					
						
							|  |  |  |         votesStatus, | 
					
						
							|  |  |  |         postFilename, | 
					
						
							|  |  |  |         description, | 
					
						
							|  |  |  |         moderated, | 
					
						
							| 
									
										
										
										
											2020-10-25 12:57:14 +00:00
										 |  |  |         postTags, | 
					
						
							| 
									
										
										
										
											2020-10-25 10:17:12 +00:00
										 |  |  |         mirrored | 
					
						
							|  |  |  |     ] | 
					
						
							| 
									
										
										
										
											2020-10-16 19:25:55 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-20 11:28:35 +00:00
										 |  |  | def _validFeedDate(pubDate: str, debug: bool = False) -> bool: | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |     # convert from YY-MM-DD HH:MM:SS+00:00 to | 
					
						
							|  |  |  |     # YY-MM-DDTHH:MM:SSZ | 
					
						
							|  |  |  |     postDate = pubDate.replace(' ', 'T').replace('+00:00', 'Z') | 
					
						
							| 
									
										
										
										
											2021-03-14 19:53:22 +00:00
										 |  |  |     return validPostDate(postDate, 90, debug) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  | def parseFeedDate(pubDate: str) -> str: | 
					
						
							|  |  |  |     """Returns a UTC date string based on the given date string
 | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |     This tries a number of formats to see which work | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     formats = ("%a, %d %b %Y %H:%M:%S %z", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S EST", | 
					
						
							|  |  |  |                "%a, %d %b %Y %H:%M:%S UT", | 
					
						
							| 
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 |  |  |                "%a, %d %b %Y %H:%M:%S GMT", | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |                "%Y-%m-%dT%H:%M:%SZ", | 
					
						
							|  |  |  |                "%Y-%m-%dT%H:%M:%S%z") | 
					
						
							|  |  |  |     publishedDate = None | 
					
						
							|  |  |  |     for dateFormat in formats: | 
					
						
							|  |  |  |         if ',' in pubDate and ',' not in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if ',' not in pubDate and ',' in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if 'Z' in pubDate and 'Z' not in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if 'Z' not in pubDate and 'Z' in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if 'EST' not in pubDate and 'EST' in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-09-07 19:09:41 +00:00
										 |  |  |         if 'GMT' not in pubDate and 'GMT' in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |         if 'EST' in pubDate and 'EST' not in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if 'UT' not in pubDate and 'UT' in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if 'UT' in pubDate and 'UT' not in dateFormat: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         try: | 
					
						
							|  |  |  |             publishedDate = \ | 
					
						
							| 
									
										
										
										
											2020-11-22 18:43:01 +00:00
										 |  |  |                 datetime.strptime(pubDate, dateFormat) | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  |         except BaseException: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if publishedDate: | 
					
						
							|  |  |  |             if pubDate.endswith(' EST'): | 
					
						
							|  |  |  |                 hoursAdded = timedelta(hours=5) | 
					
						
							|  |  |  |                 publishedDate = publishedDate + hoursAdded | 
					
						
							|  |  |  |             break | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     pubDateStr = None | 
					
						
							|  |  |  |     if publishedDate: | 
					
						
							| 
									
										
										
										
											2020-11-22 20:33:24 +00:00
										 |  |  |         offset = publishedDate.utcoffset() | 
					
						
							| 
									
										
										
										
											2020-11-22 20:37:08 +00:00
										 |  |  |         if offset: | 
					
						
							|  |  |  |             publishedDate = publishedDate - offset | 
					
						
							| 
									
										
										
										
											2020-11-22 19:09:35 +00:00
										 |  |  |         # convert local date to UTC | 
					
						
							|  |  |  |         publishedDate = publishedDate.replace(tzinfo=timezone.utc) | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  |         pubDateStr = str(publishedDate) | 
					
						
							|  |  |  |         if not pubDateStr.endswith('+00:00'): | 
					
						
							|  |  |  |             pubDateStr += '+00:00' | 
					
						
							| 
									
										
										
										
											2021-09-07 19:33:27 +00:00
										 |  |  |     else: | 
					
						
							|  |  |  |         print('WARN: unrecognized date format: ' + pubDate) | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     return pubDateStr | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 |  |  | def loadHashtagCategories(baseDir: str, language: str) -> None: | 
					
						
							|  |  |  |     """Loads an rss file containing hashtag categories
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     hashtagCategoriesFilename = baseDir + '/categories.xml' | 
					
						
							|  |  |  |     if not os.path.isfile(hashtagCategoriesFilename): | 
					
						
							|  |  |  |         hashtagCategoriesFilename = \ | 
					
						
							|  |  |  |             baseDir + '/defaultcategories/' + language + '.xml' | 
					
						
							|  |  |  |         if not os.path.isfile(hashtagCategoriesFilename): | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-06-21 22:52:04 +00:00
										 |  |  |     with open(hashtagCategoriesFilename, 'r') as fp: | 
					
						
							|  |  |  |         xmlStr = fp.read() | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         _xml2StrToHashtagCategories(baseDir, xmlStr, 1024, True) | 
					
						
							| 
									
										
										
										
											2020-12-05 13:38:07 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _xml2StrToHashtagCategories(baseDir: str, xmlStr: str, | 
					
						
							|  |  |  |                                 maxCategoriesFeedItemSizeKb: int, | 
					
						
							| 
									
										
										
										
											2021-06-20 11:28:35 +00:00
										 |  |  |                                 force: bool = False) -> None: | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |     """Updates hashtag categories based upon an rss feed
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     rssItems = xmlStr.split('<item>') | 
					
						
							|  |  |  |     maxBytes = maxCategoriesFeedItemSizeKb * 1024 | 
					
						
							|  |  |  |     for rssItem in rssItems: | 
					
						
							|  |  |  |         if not rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(rssItem) > maxBytes: | 
					
						
							|  |  |  |             print('WARN: rss categories feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<description>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</description>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         categoryStr = rssItem.split('<title>')[1] | 
					
						
							|  |  |  |         categoryStr = categoryStr.split('</title>')[0].strip() | 
					
						
							|  |  |  |         if not categoryStr: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 |  |  |         if 'CDATA' in categoryStr: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |         hashtagListStr = rssItem.split('<description>')[1] | 
					
						
							|  |  |  |         hashtagListStr = hashtagListStr.split('</description>')[0].strip() | 
					
						
							|  |  |  |         if not hashtagListStr: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-12-03 10:12:09 +00:00
										 |  |  |         if 'CDATA' in hashtagListStr: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |         hashtagList = hashtagListStr.split(' ') | 
					
						
							| 
									
										
										
										
											2020-12-02 22:40:46 +00:00
										 |  |  |         if not isBlockedHashtag(baseDir, categoryStr): | 
					
						
							|  |  |  |             for hashtag in hashtagList: | 
					
						
							| 
									
										
										
										
											2021-08-15 11:39:20 +00:00
										 |  |  |                 setHashtagCategory(baseDir, hashtag, categoryStr, | 
					
						
							|  |  |  |                                    False, force) | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _xml2StrToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                    moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                    maxPostsPerSource: int, | 
					
						
							|  |  |  |                    maxFeedItemSizeKb: int, | 
					
						
							|  |  |  |                    maxCategoriesFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     """Converts an xml RSS 2.0 string to a dictionary
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     if '<item>' not in xmlStr: | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # is this an rss feed containing hashtag categories? | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |     if '<title>#categories</title>' in xmlStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         _xml2StrToHashtagCategories(baseDir, xmlStr, | 
					
						
							|  |  |  |                                     maxCategoriesFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-12-02 16:18:36 +00:00
										 |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2020-12-09 10:38:09 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     rssItems = xmlStr.split('<item>') | 
					
						
							| 
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 |  |  |     postCtr = 0 | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |     maxBytes = maxFeedItemSizeKb * 1024 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     for rssItem in rssItems: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |         if not rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |         if len(rssItem) > maxBytes: | 
					
						
							|  |  |  |             print('WARN: rss feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         if '<title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<link>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</link>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<pubDate>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</pubDate>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         title = rssItem.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         title = _removeCDATA(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-02-13 21:48:24 +00:00
										 |  |  |         title = removeHtml(title) | 
					
						
							| 
									
										
										
										
											2020-10-07 12:05:49 +00:00
										 |  |  |         description = '' | 
					
						
							|  |  |  |         if '<description>' in rssItem and '</description>' in rssItem: | 
					
						
							|  |  |  |             description = rssItem.split('<description>')[1] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |             description = removeHtml(description.split('</description>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-21 23:18:34 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             if '<media:description>' in rssItem and \ | 
					
						
							|  |  |  |                '</media:description>' in rssItem: | 
					
						
							|  |  |  |                 description = rssItem.split('<media:description>')[1] | 
					
						
							|  |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |                 description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         link = rssItem.split('<link>')[1] | 
					
						
							|  |  |  |         link = link.split('</link>')[0] | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-17 20:53:36 +00:00
										 |  |  |         itemDomain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in itemDomain: | 
					
						
							|  |  |  |             itemDomain = itemDomain.split('/')[0] | 
					
						
							|  |  |  |         if isBlockedDomain(baseDir, itemDomain): | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         pubDate = rssItem.split('<pubDate>')[1] | 
					
						
							|  |  |  |         pubDate = pubDate.split('</pubDate>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  |         pubDateStr = parseFeedDate(pubDate) | 
					
						
							|  |  |  |         if pubDateStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |             if _validFeedDate(pubDateStr): | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postFilename = '' | 
					
						
							|  |  |  |                 votesStatus = [] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                       result, pubDateStr, | 
					
						
							|  |  |  |                                       title, link, | 
					
						
							|  |  |  |                                       votesStatus, postFilename, | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                                       description, moderated, | 
					
						
							|  |  |  |                                       mirrored) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postCtr += 1 | 
					
						
							|  |  |  |                 if postCtr >= maxPostsPerSource: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |     if postCtr > 0: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         print('Added ' + str(postCtr) + | 
					
						
							|  |  |  |               ' rss 2.0 feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _xml1StrToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                    moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                    maxPostsPerSource: int, | 
					
						
							|  |  |  |                    maxFeedItemSizeKb: int, | 
					
						
							|  |  |  |                    maxCategoriesFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     """Converts an xml RSS 1.0 string to a dictionary
 | 
					
						
							|  |  |  |     https://validator.w3.org/feed/docs/rss1.html | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 |  |  |     itemStr = '<item' | 
					
						
							|  |  |  |     if itemStr not in xmlStr: | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # is this an rss feed containing hashtag categories? | 
					
						
							|  |  |  |     if '<title>#categories</title>' in xmlStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         _xml2StrToHashtagCategories(baseDir, xmlStr, | 
					
						
							|  |  |  |                                     maxCategoriesFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 |  |  |     rssItems = xmlStr.split(itemStr) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     postCtr = 0 | 
					
						
							|  |  |  |     maxBytes = maxFeedItemSizeKb * 1024 | 
					
						
							|  |  |  |     for rssItem in rssItems: | 
					
						
							|  |  |  |         if not rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(rssItem) > maxBytes: | 
					
						
							| 
									
										
										
										
											2020-12-14 17:18:16 +00:00
										 |  |  |             print('WARN: rss 1.0 feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if rssItem.startswith('s>'): | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |             continue | 
					
						
							|  |  |  |         if '<title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</title>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<link>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</link>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '<dc:date>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if '</dc:date>' not in rssItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         title = rssItem.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         title = _removeCDATA(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-02-13 21:48:24 +00:00
										 |  |  |         title = removeHtml(title) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         description = '' | 
					
						
							|  |  |  |         if '<description>' in rssItem and '</description>' in rssItem: | 
					
						
							|  |  |  |             description = rssItem.split('<description>')[1] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |             description = removeHtml(description.split('</description>')[0]) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             if '<media:description>' in rssItem and \ | 
					
						
							|  |  |  |                '</media:description>' in rssItem: | 
					
						
							|  |  |  |                 description = rssItem.split('<media:description>')[1] | 
					
						
							|  |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |                 description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |         link = rssItem.split('<link>')[1] | 
					
						
							|  |  |  |         link = link.split('</link>')[0] | 
					
						
							|  |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         itemDomain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in itemDomain: | 
					
						
							|  |  |  |             itemDomain = itemDomain.split('/')[0] | 
					
						
							|  |  |  |         if isBlockedDomain(baseDir, itemDomain): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         pubDate = rssItem.split('<dc:date>')[1] | 
					
						
							|  |  |  |         pubDate = pubDate.split('</dc:date>')[0] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         pubDateStr = parseFeedDate(pubDate) | 
					
						
							|  |  |  |         if pubDateStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |             if _validFeedDate(pubDateStr): | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postFilename = '' | 
					
						
							|  |  |  |                 votesStatus = [] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                       result, pubDateStr, | 
					
						
							|  |  |  |                                       title, link, | 
					
						
							|  |  |  |                                       votesStatus, postFilename, | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                                       description, moderated, | 
					
						
							|  |  |  |                                       mirrored) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postCtr += 1 | 
					
						
							|  |  |  |                 if postCtr >= maxPostsPerSource: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     if postCtr > 0: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         print('Added ' + str(postCtr) + | 
					
						
							|  |  |  |               ' rss 1.0 feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-12-14 14:22:44 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _atomFeedToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                     moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                     maxPostsPerSource: int, | 
					
						
							|  |  |  |                     maxFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |     """Converts an atom feed string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if '<entry>' not in xmlStr: | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |     atomItems = xmlStr.split('<entry>') | 
					
						
							| 
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 |  |  |     postCtr = 0 | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |     maxBytes = maxFeedItemSizeKb * 1024 | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |     for atomItem in atomItems: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |         if not atomItem: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if len(atomItem) > maxBytes: | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |             print('WARN: atom feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<title>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '</title>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<link>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '</link>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<updated>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '</updated>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         title = atomItem.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         title = _removeCDATA(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2021-02-13 21:48:24 +00:00
										 |  |  |         title = removeHtml(title) | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<summary>' in atomItem and '</summary>' in atomItem: | 
					
						
							|  |  |  |             description = atomItem.split('<summary>')[1] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |             description = removeHtml(description.split('</summary>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 |  |  |         else: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |             if '<media:description>' in atomItem and \ | 
					
						
							|  |  |  |                '</media:description>' in atomItem: | 
					
						
							|  |  |  |                 description = atomItem.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-21 23:29:46 +00:00
										 |  |  |                 description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |                 description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         link = atomItem.split('<link>')[1] | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |         link = link.split('</link>')[0] | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-17 20:53:36 +00:00
										 |  |  |         itemDomain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in itemDomain: | 
					
						
							|  |  |  |             itemDomain = itemDomain.split('/')[0] | 
					
						
							|  |  |  |         if isBlockedDomain(baseDir, itemDomain): | 
					
						
							| 
									
										
										
										
											2020-10-16 11:58:31 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         pubDate = atomItem.split('<updated>')[1] | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |         pubDate = pubDate.split('</updated>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  |         pubDateStr = parseFeedDate(pubDate) | 
					
						
							|  |  |  |         if pubDateStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |             if _validFeedDate(pubDateStr): | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postFilename = '' | 
					
						
							|  |  |  |                 votesStatus = [] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                       result, pubDateStr, | 
					
						
							|  |  |  |                                       title, link, | 
					
						
							|  |  |  |                                       votesStatus, postFilename, | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |                                       description, moderated, | 
					
						
							|  |  |  |                                       mirrored) | 
					
						
							|  |  |  |                 postCtr += 1 | 
					
						
							|  |  |  |                 if postCtr >= maxPostsPerSource: | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |     if postCtr > 0: | 
					
						
							|  |  |  |         print('Added ' + str(postCtr) + | 
					
						
							|  |  |  |               ' atom feed items to newswire') | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:30:23 +00:00
										 |  |  | def _jsonFeedV1ToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                       moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                       maxPostsPerSource: int, | 
					
						
							|  |  |  |                       maxFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     """Converts a json feed string to a dictionary
 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 |  |  |     See https://jsonfeed.org/version/1.1 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 |  |  |     if '"items"' not in xmlStr: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     try: | 
					
						
							|  |  |  |         feedJson = json.loads(xmlStr) | 
					
						
							|  |  |  |     except BaseException: | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     maxBytes = maxFeedItemSizeKb * 1024 | 
					
						
							|  |  |  |     if not feedJson.get('version'): | 
					
						
							|  |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2021-02-12 11:46:26 +00:00
										 |  |  |     if not feedJson['version'].startswith('https://jsonfeed.org/version/1'): | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         return {} | 
					
						
							|  |  |  |     if not feedJson.get('items'): | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     if not isinstance(feedJson['items'], list): | 
					
						
							|  |  |  |         return {} | 
					
						
							| 
									
										
										
										
											2021-02-12 11:50:05 +00:00
										 |  |  |     postCtr = 0 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:47:49 +00:00
										 |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     for jsonFeedItem in feedJson['items']: | 
					
						
							|  |  |  |         if not jsonFeedItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not isinstance(jsonFeedItem, dict): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not jsonFeedItem.get('url'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not isinstance(jsonFeedItem['url'], str): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not jsonFeedItem.get('date_published'): | 
					
						
							|  |  |  |             if not jsonFeedItem.get('date_modified'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |         if not jsonFeedItem.get('content_text'): | 
					
						
							|  |  |  |             if not jsonFeedItem.get('content_html'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |         if jsonFeedItem.get('content_html'): | 
					
						
							|  |  |  |             if not isinstance(jsonFeedItem['content_html'], str): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             title = removeHtml(jsonFeedItem['content_html']) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             if not isinstance(jsonFeedItem['content_text'], str): | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-02-12 12:02:09 +00:00
										 |  |  |             title = removeHtml(jsonFeedItem['content_text']) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         if len(title) > maxBytes: | 
					
						
							|  |  |  |             print('WARN: json feed title is too long') | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         description = '' | 
					
						
							|  |  |  |         if jsonFeedItem.get('description'): | 
					
						
							|  |  |  |             if not isinstance(jsonFeedItem['description'], str): | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-02-12 12:02:09 +00:00
										 |  |  |             description = removeHtml(jsonFeedItem['description']) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |             if len(description) > maxBytes: | 
					
						
							|  |  |  |                 print('WARN: json feed description is too long') | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 |  |  |             if jsonFeedItem.get('tags'): | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:48 +00:00
										 |  |  |                 if isinstance(jsonFeedItem['tags'], list): | 
					
						
							| 
									
										
										
										
											2021-02-12 12:09:16 +00:00
										 |  |  |                     for tagName in jsonFeedItem['tags']: | 
					
						
							|  |  |  |                         if not isinstance(tagName, str): | 
					
						
							|  |  |  |                             continue | 
					
						
							|  |  |  |                         if ' ' in tagName: | 
					
						
							|  |  |  |                             continue | 
					
						
							|  |  |  |                         if not tagName.startswith('#'): | 
					
						
							|  |  |  |                             tagName = '#' + tagName | 
					
						
							|  |  |  |                         if tagName not in description: | 
					
						
							|  |  |  |                             description += ' ' + tagName | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         link = jsonFeedItem['url'] | 
					
						
							|  |  |  |         if '://' not in link: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if len(link) > maxBytes: | 
					
						
							|  |  |  |             print('WARN: json feed link is too long') | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         itemDomain = link.split('://')[1] | 
					
						
							|  |  |  |         if '/' in itemDomain: | 
					
						
							|  |  |  |             itemDomain = itemDomain.split('/')[0] | 
					
						
							|  |  |  |         if isBlockedDomain(baseDir, itemDomain): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if jsonFeedItem.get('date_published'): | 
					
						
							|  |  |  |             if not isinstance(jsonFeedItem['date_published'], str): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             pubDate = jsonFeedItem['date_published'] | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             if not isinstance(jsonFeedItem['date_modified'], str): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             pubDate = jsonFeedItem['date_modified'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         pubDateStr = parseFeedDate(pubDate) | 
					
						
							|  |  |  |         if pubDateStr: | 
					
						
							|  |  |  |             if _validFeedDate(pubDateStr): | 
					
						
							|  |  |  |                 postFilename = '' | 
					
						
							|  |  |  |                 votesStatus = [] | 
					
						
							|  |  |  |                 _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                       result, pubDateStr, | 
					
						
							|  |  |  |                                       title, link, | 
					
						
							|  |  |  |                                       votesStatus, postFilename, | 
					
						
							|  |  |  |                                       description, moderated, | 
					
						
							|  |  |  |                                       mirrored) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postCtr += 1 | 
					
						
							|  |  |  |                 if postCtr >= maxPostsPerSource: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |     if postCtr > 0: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |         print('Added ' + str(postCtr) + | 
					
						
							|  |  |  |               ' json feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-10-10 12:24:14 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                       moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                       maxPostsPerSource: int, | 
					
						
							|  |  |  |                       maxFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |     """Converts an atom-style YouTube feed string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if '<entry>' not in xmlStr: | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     if isBlockedDomain(baseDir, 'www.youtube.com'): | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |     atomItems = xmlStr.split('<entry>') | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |     postCtr = 0 | 
					
						
							|  |  |  |     maxBytes = maxFeedItemSizeKb * 1024 | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |     for atomItem in atomItems: | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |         if not atomItem: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not atomItem.strip(): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if len(atomItem) > maxBytes: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             print('WARN: atom feed item is too big') | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<title>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '</title>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-28 20:46:52 +00:00
										 |  |  |         if '<published>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-28 20:46:52 +00:00
										 |  |  |         if '</published>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<yt:videoId>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '</yt:videoId>' not in atomItem: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         title = atomItem.split('<title>')[1] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         title = _removeCDATA(title.split('</title>')[0]) | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |         description = '' | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         if '<media:description>' in atomItem and \ | 
					
						
							|  |  |  |            '</media:description>' in atomItem: | 
					
						
							|  |  |  |             description = atomItem.split('<media:description>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             description = description.split('</media:description>')[0] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |             description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         elif '<summary>' in atomItem and '</summary>' in atomItem: | 
					
						
							|  |  |  |             description = atomItem.split('<summary>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |             description = description.split('</summary>')[0] | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |             description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-11-22 12:41:54 +00:00
										 |  |  |         link = atomItem.split('<yt:videoId>')[1] | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |         link = link.split('</yt:videoId>')[0] | 
					
						
							|  |  |  |         link = 'https://www.youtube.com/watch?v=' + link.strip() | 
					
						
							| 
									
										
										
										
											2020-11-28 20:46:52 +00:00
										 |  |  |         pubDate = atomItem.split('<published>')[1] | 
					
						
							|  |  |  |         pubDate = pubDate.split('</published>')[0] | 
					
						
							| 
									
										
										
										
											2020-11-22 18:14:40 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-11-22 19:01:18 +00:00
										 |  |  |         pubDateStr = parseFeedDate(pubDate) | 
					
						
							|  |  |  |         if pubDateStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |             if _validFeedDate(pubDateStr): | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postFilename = '' | 
					
						
							|  |  |  |                 votesStatus = [] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                       result, pubDateStr, | 
					
						
							|  |  |  |                                       title, link, | 
					
						
							|  |  |  |                                       votesStatus, postFilename, | 
					
						
							|  |  |  |                                       description, moderated, mirrored) | 
					
						
							| 
									
										
										
										
											2020-12-21 12:11:45 +00:00
										 |  |  |                 postCtr += 1 | 
					
						
							|  |  |  |                 if postCtr >= maxPostsPerSource: | 
					
						
							|  |  |  |                     break | 
					
						
							| 
									
										
										
										
											2020-11-27 22:43:34 +00:00
										 |  |  |     if postCtr > 0: | 
					
						
							|  |  |  |         print('Added ' + str(postCtr) + ' YouTube feed items to newswire') | 
					
						
							| 
									
										
										
										
											2020-11-22 10:34:42 +00:00
										 |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _xmlStrToDict(baseDir: str, domain: str, xmlStr: str, | 
					
						
							|  |  |  |                   moderated: bool, mirrored: bool, | 
					
						
							|  |  |  |                   maxPostsPerSource: int, | 
					
						
							|  |  |  |                   maxFeedItemSizeKb: int, | 
					
						
							|  |  |  |                   maxCategoriesFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """Converts an xml string to a dictionary
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-11-22 16:10:58 +00:00
										 |  |  |     if '<yt:videoId>' in xmlStr and '<yt:channelId>' in xmlStr: | 
					
						
							|  |  |  |         print('YouTube feed: reading') | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         return _atomFeedYTToDict(baseDir, domain, | 
					
						
							|  |  |  |                                  xmlStr, moderated, mirrored, | 
					
						
							|  |  |  |                                  maxPostsPerSource, maxFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-11-22 16:10:58 +00:00
										 |  |  |     elif 'rss version="2.0"' in xmlStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         return _xml2StrToDict(baseDir, domain, | 
					
						
							|  |  |  |                               xmlStr, moderated, mirrored, | 
					
						
							|  |  |  |                               maxPostsPerSource, maxFeedItemSizeKb, | 
					
						
							|  |  |  |                               maxCategoriesFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-12-14 20:22:05 +00:00
										 |  |  |     elif '<?xml version="1.0"' in xmlStr: | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |         return _xml1StrToDict(baseDir, domain, | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |                               xmlStr, moderated, mirrored, | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                               maxPostsPerSource, maxFeedItemSizeKb, | 
					
						
							|  |  |  |                               maxCategoriesFeedItemSizeKb) | 
					
						
							|  |  |  |     elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr: | 
					
						
							|  |  |  |         return _atomFeedToDict(baseDir, domain, | 
					
						
							|  |  |  |                                xmlStr, moderated, mirrored, | 
					
						
							|  |  |  |                                maxPostsPerSource, maxFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2021-02-12 11:28:00 +00:00
										 |  |  |     elif 'https://jsonfeed.org/version/1' in xmlStr: | 
					
						
							| 
									
										
										
										
											2021-02-12 11:30:23 +00:00
										 |  |  |         return _jsonFeedV1ToDict(baseDir, domain, | 
					
						
							|  |  |  |                                  xmlStr, moderated, mirrored, | 
					
						
							|  |  |  |                                  maxPostsPerSource, maxFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return {} | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _YTchannelToAtomFeed(url: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 |  |  |     """Converts a YouTube channel url into an atom feed url
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if 'youtube.com/channel/' not in url: | 
					
						
							|  |  |  |         return url | 
					
						
							| 
									
										
										
										
											2020-11-22 12:27:42 +00:00
										 |  |  |     channelId = url.split('youtube.com/channel/')[1].strip() | 
					
						
							| 
									
										
										
										
											2020-11-22 12:36:21 +00:00
										 |  |  |     channelUrl = \ | 
					
						
							|  |  |  |         'https://www.youtube.com/feeds/videos.xml?channel_id=' + channelId | 
					
						
							|  |  |  |     print('YouTube feed: ' + channelUrl) | 
					
						
							|  |  |  |     return channelUrl | 
					
						
							| 
									
										
										
										
											2020-11-22 10:46:54 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 14:37:17 +00:00
										 |  |  | def getRSS(baseDir: str, domain: str, session, url: str, | 
					
						
							|  |  |  |            moderated: bool, mirrored: bool, | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |            maxPostsPerSource: int, maxFeedSizeKb: int, | 
					
						
							| 
									
										
										
										
											2020-12-02 17:02:32 +00:00
										 |  |  |            maxFeedItemSizeKb: int, | 
					
						
							|  |  |  |            maxCategoriesFeedItemSizeKb: int) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """Returns an RSS url as a dict
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not isinstance(url, str): | 
					
						
							|  |  |  |         print('url: ' + str(url)) | 
					
						
							|  |  |  |         print('ERROR: getRSS url should be a string') | 
					
						
							|  |  |  |         return None | 
					
						
							|  |  |  |     headers = { | 
					
						
							| 
									
										
										
										
											2020-12-14 20:22:05 +00:00
										 |  |  |         'Accept': 'text/xml, application/xml; charset=UTF-8' | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     } | 
					
						
							|  |  |  |     params = None | 
					
						
							|  |  |  |     sessionParams = {} | 
					
						
							|  |  |  |     sessionHeaders = {} | 
					
						
							|  |  |  |     if headers: | 
					
						
							|  |  |  |         sessionHeaders = headers | 
					
						
							|  |  |  |     if params: | 
					
						
							|  |  |  |         sessionParams = params | 
					
						
							|  |  |  |     sessionHeaders['User-Agent'] = \ | 
					
						
							|  |  |  |         'Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0' | 
					
						
							|  |  |  |     if not session: | 
					
						
							|  |  |  |         print('WARN: no session specified for getRSS') | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |     url = _YTchannelToAtomFeed(url) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     try: | 
					
						
							|  |  |  |         result = session.get(url, headers=sessionHeaders, params=sessionParams) | 
					
						
							| 
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 |  |  |         if result: | 
					
						
							| 
									
										
										
										
											2020-10-16 12:03:56 +00:00
										 |  |  |             if int(len(result.text) / 1024) < maxFeedSizeKb and \ | 
					
						
							|  |  |  |                not containsInvalidChars(result.text): | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 return _xmlStrToDict(baseDir, domain, result.text, | 
					
						
							|  |  |  |                                      moderated, mirrored, | 
					
						
							|  |  |  |                                      maxPostsPerSource, | 
					
						
							|  |  |  |                                      maxFeedItemSizeKb, | 
					
						
							|  |  |  |                                      maxCategoriesFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-10-16 11:40:01 +00:00
										 |  |  |             else: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:43:22 +00:00
										 |  |  |                 print('WARN: feed is too large, ' + | 
					
						
							|  |  |  |                       'or contains invalid characters: ' + url) | 
					
						
							| 
									
										
										
										
											2020-11-22 13:04:58 +00:00
										 |  |  |         else: | 
					
						
							|  |  |  |             print('WARN: no result returned for feed ' + url) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     except requests.exceptions.RequestException as e: | 
					
						
							| 
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 |  |  |         print('WARN: getRSS failed\nurl: ' + str(url) + ', ' + | 
					
						
							|  |  |  |               'headers: ' + str(sessionHeaders) + ', ' + | 
					
						
							|  |  |  |               'params: ' + str(sessionParams) + ', ' + str(e)) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     except ValueError as e: | 
					
						
							| 
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 |  |  |         print('WARN: getRSS failed\nurl: ' + str(url) + ', ' + | 
					
						
							|  |  |  |               'headers: ' + str(sessionHeaders) + ', ' + | 
					
						
							|  |  |  |               'params: ' + str(sessionParams) + ', ' + str(e)) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     except SocketError as e: | 
					
						
							|  |  |  |         if e.errno == errno.ECONNRESET: | 
					
						
							| 
									
										
										
										
											2021-05-20 12:52:13 +00:00
										 |  |  |             print('WARN: connection was reset during getRSS ' + str(e)) | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             print('WARN: getRSS, ' + str(e)) | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return None | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  | def getRSSfromDict(baseDir: str, newswire: {}, | 
					
						
							|  |  |  |                    httpPrefix: str, domainFull: str, | 
					
						
							|  |  |  |                    title: str, translate: {}) -> str: | 
					
						
							|  |  |  |     """Returns an rss feed from the current newswire dict.
 | 
					
						
							|  |  |  |     This allows other instances to subscribe to the same newswire | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     rssStr = rss2Header(httpPrefix, | 
					
						
							|  |  |  |                         None, domainFull, | 
					
						
							|  |  |  |                         'Newswire', translate) | 
					
						
							| 
									
										
										
										
											2020-11-03 14:41:28 +00:00
										 |  |  |     if not newswire: | 
					
						
							|  |  |  |         return '' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     for published, fields in newswire.items(): | 
					
						
							| 
									
										
										
										
											2020-10-20 12:22:52 +00:00
										 |  |  |         if '+00:00' in published: | 
					
						
							|  |  |  |             published = published.replace('+00:00', 'Z').strip() | 
					
						
							|  |  |  |             published = published.replace(' ', 'T') | 
					
						
							|  |  |  |         else: | 
					
						
							|  |  |  |             publishedWithOffset = \ | 
					
						
							| 
									
										
										
										
											2020-10-20 12:37:32 +00:00
										 |  |  |                 datetime.strptime(published, "%Y-%m-%d %H:%M:%S%z") | 
					
						
							| 
									
										
										
										
											2020-10-20 12:22:52 +00:00
										 |  |  |             published = publishedWithOffset.strftime("%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 |  |  |         try: | 
					
						
							| 
									
										
										
										
											2020-10-04 22:12:27 +00:00
										 |  |  |             pubDate = datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ") | 
					
						
							| 
									
										
										
										
											2020-10-20 12:28:15 +00:00
										 |  |  |         except Exception as e: | 
					
						
							|  |  |  |             print('WARN: Unable to convert date ' + published + ' ' + str(e)) | 
					
						
							| 
									
										
										
										
											2020-10-04 22:08:13 +00:00
										 |  |  |             continue | 
					
						
							| 
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 |  |  |         rssStr += \ | 
					
						
							|  |  |  |             '<item>\n' + \ | 
					
						
							|  |  |  |             '  <title>' + fields[0] + '</title>\n' | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |         description = removeHtml(firstParagraphFromString(fields[4])) | 
					
						
							| 
									
										
										
										
											2020-11-08 10:45:33 +00:00
										 |  |  |         rssStr += '  <description>' + description + '</description>\n' | 
					
						
							| 
									
										
										
										
											2020-10-08 15:07:06 +00:00
										 |  |  |         url = fields[1] | 
					
						
							| 
									
										
										
										
											2020-11-08 11:04:52 +00:00
										 |  |  |         if '://' not in url: | 
					
						
							|  |  |  |             if domainFull not in url: | 
					
						
							|  |  |  |                 url = httpPrefix + '://' + domainFull + url | 
					
						
							| 
									
										
										
										
											2020-10-08 15:07:06 +00:00
										 |  |  |         rssStr += '  <link>' + url + '</link>\n' | 
					
						
							| 
									
										
										
										
											2020-10-04 22:12:27 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |         rssDateStr = pubDate.strftime("%a, %d %b %Y %H:%M:%S UT") | 
					
						
							| 
									
										
										
										
											2021-07-04 11:02:08 +00:00
										 |  |  |         rssStr += \ | 
					
						
							|  |  |  |             '  <pubDate>' + rssDateStr + '</pubDate>\n' + \ | 
					
						
							|  |  |  |             '</item>\n' | 
					
						
							| 
									
										
										
										
											2020-10-04 12:29:07 +00:00
										 |  |  |     rssStr += rss2Footer() | 
					
						
							|  |  |  |     return rssStr | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _isNewswireBlogPost(postJsonObject: {}) -> bool: | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     """Is the given object a blog post?
 | 
					
						
							| 
									
										
										
										
											2020-10-25 10:47:39 +00:00
										 |  |  |     There isn't any difference between a blog post and a newswire blog post | 
					
						
							|  |  |  |     but we may here need to check for different properties than | 
					
						
							|  |  |  |     isBlogPost does | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     """
 | 
					
						
							|  |  |  |     if not postJsonObject: | 
					
						
							|  |  |  |         return False | 
					
						
							| 
									
										
										
										
											2021-06-22 15:45:59 +00:00
										 |  |  |     if not hasObjectDict(postJsonObject): | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |         return False | 
					
						
							|  |  |  |     if postJsonObject['object'].get('summary') and \ | 
					
						
							|  |  |  |        postJsonObject['object'].get('url') and \ | 
					
						
							| 
									
										
										
										
											2020-11-08 09:47:01 +00:00
										 |  |  |        postJsonObject['object'].get('content') and \ | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |        postJsonObject['object'].get('published'): | 
					
						
							| 
									
										
										
										
											2020-10-25 10:42:38 +00:00
										 |  |  |         return isPublicPost(postJsonObject) | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _getHashtagsFromPost(postJsonObject: {}) -> []: | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |     """Returns a list of any hashtags within a post
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-06-22 15:45:59 +00:00
										 |  |  |     if not hasObjectDict(postJsonObject): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         return [] | 
					
						
							|  |  |  |     if not postJsonObject['object'].get('tag'): | 
					
						
							|  |  |  |         return [] | 
					
						
							| 
									
										
										
										
											2020-10-18 09:28:43 +00:00
										 |  |  |     if not isinstance(postJsonObject['object']['tag'], list): | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         return [] | 
					
						
							|  |  |  |     tags = [] | 
					
						
							| 
									
										
										
										
											2020-10-18 09:28:43 +00:00
										 |  |  |     for tg in postJsonObject['object']['tag']: | 
					
						
							| 
									
										
										
										
											2020-10-16 20:13:23 +00:00
										 |  |  |         if not isinstance(tg, dict): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not tg.get('name'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not tg.get('type'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if tg['type'] != 'Hashtag': | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if tg['name'] not in tags: | 
					
						
							|  |  |  |             tags.append(tg['name']) | 
					
						
							|  |  |  |     return tags | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _addAccountBlogsToNewswire(baseDir: str, nickname: str, domain: str, | 
					
						
							|  |  |  |                                newswire: {}, | 
					
						
							|  |  |  |                                maxBlogsPerAccount: int, | 
					
						
							|  |  |  |                                indexFilename: str, | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                                maxTags: int, systemLanguage: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     """Adds blogs for the given account to the newswire
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not os.path.isfile(indexFilename): | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  |     # local blog entries are unmoderated by default | 
					
						
							|  |  |  |     moderated = False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # local blogs can potentially be moderated | 
					
						
							|  |  |  |     moderatedFilename = \ | 
					
						
							| 
									
										
										
										
											2021-07-13 21:59:53 +00:00
										 |  |  |         acctDir(baseDir, nickname, domain) + '/.newswiremoderated' | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  |     if os.path.isfile(moderatedFilename): | 
					
						
							|  |  |  |         moderated = True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     with open(indexFilename, 'r') as indexFile: | 
					
						
							|  |  |  |         postFilename = 'start' | 
					
						
							|  |  |  |         ctr = 0 | 
					
						
							|  |  |  |         while postFilename: | 
					
						
							|  |  |  |             postFilename = indexFile.readline() | 
					
						
							|  |  |  |             if postFilename: | 
					
						
							|  |  |  |                 # if this is a full path then remove the directories | 
					
						
							|  |  |  |                 if '/' in postFilename: | 
					
						
							|  |  |  |                     postFilename = postFilename.split('/')[-1] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # filename of the post without any extension or path | 
					
						
							|  |  |  |                 # This should also correspond to any index entry in | 
					
						
							|  |  |  |                 # the posts cache | 
					
						
							|  |  |  |                 postUrl = \ | 
					
						
							|  |  |  |                     postFilename.replace('\n', '').replace('\r', '') | 
					
						
							|  |  |  |                 postUrl = postUrl.replace('.json', '').strip() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 # read the post from file | 
					
						
							|  |  |  |                 fullPostFilename = \ | 
					
						
							|  |  |  |                     locatePost(baseDir, nickname, | 
					
						
							|  |  |  |                                domain, postUrl, False) | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  |                 if not fullPostFilename: | 
					
						
							| 
									
										
										
										
											2021-02-11 12:40:56 +00:00
										 |  |  |                     print('Unable to locate post for newswire ' + postUrl) | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  |                     ctr += 1 | 
					
						
							|  |  |  |                     if ctr >= maxBlogsPerAccount: | 
					
						
							|  |  |  |                         break | 
					
						
							| 
									
										
										
										
											2020-10-06 13:34:04 +00:00
										 |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2020-10-06 13:05:15 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 postJsonObject = None | 
					
						
							|  |  |  |                 if fullPostFilename: | 
					
						
							|  |  |  |                     postJsonObject = loadJson(fullPostFilename) | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 if _isNewswireBlogPost(postJsonObject): | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |                     published = postJsonObject['object']['published'] | 
					
						
							|  |  |  |                     published = published.replace('T', ' ') | 
					
						
							|  |  |  |                     published = published.replace('Z', '+00:00') | 
					
						
							| 
									
										
										
										
											2020-10-06 20:17:34 +00:00
										 |  |  |                     votes = [] | 
					
						
							|  |  |  |                     if os.path.isfile(fullPostFilename + '.votes'): | 
					
						
							|  |  |  |                         votes = loadJson(fullPostFilename + '.votes') | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                     content = \ | 
					
						
							| 
									
										
										
										
											2021-07-20 13:33:27 +00:00
										 |  |  |                         getBaseContentFromPost(postJsonObject, systemLanguage) | 
					
						
							| 
									
										
										
										
											2020-11-08 10:45:33 +00:00
										 |  |  |                     description = firstParagraphFromString(content) | 
					
						
							| 
									
										
										
										
											2021-01-11 21:54:25 +00:00
										 |  |  |                     description = removeHtml(description) | 
					
						
							| 
									
										
										
										
											2020-12-22 21:24:46 +00:00
										 |  |  |                     tagsFromPost = _getHashtagsFromPost(postJsonObject) | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                     _addNewswireDictEntry(baseDir, domain, | 
					
						
							|  |  |  |                                           newswire, published, | 
					
						
							|  |  |  |                                           postJsonObject['object']['summary'], | 
					
						
							|  |  |  |                                           postJsonObject['object']['url'], | 
					
						
							|  |  |  |                                           votes, fullPostFilename, | 
					
						
							|  |  |  |                                           description, moderated, False, | 
					
						
							| 
									
										
										
										
											2020-12-22 21:24:46 +00:00
										 |  |  |                                           tagsFromPost, | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                                           maxTags) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |             ctr += 1 | 
					
						
							|  |  |  |             if ctr >= maxBlogsPerAccount: | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  | def _addBlogsToNewswire(baseDir: str, domain: str, newswire: {}, | 
					
						
							|  |  |  |                         maxBlogsPerAccount: int, | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                         maxTags: int, systemLanguage: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 |  |  |     """Adds blogs from each user account into the newswire
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 |  |  |     moderationDict = {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     # go through each account | 
					
						
							|  |  |  |     for subdir, dirs, files in os.walk(baseDir + '/accounts'): | 
					
						
							|  |  |  |         for handle in dirs: | 
					
						
							| 
									
										
										
										
											2021-07-04 17:55:29 +00:00
										 |  |  |             if not isAccountDir(handle): | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-10-06 10:34:56 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:37:22 +00:00
										 |  |  |             nickname = handle.split('@')[0] | 
					
						
							| 
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |             # has this account been suspended? | 
					
						
							| 
									
										
										
										
											2020-10-06 08:58:44 +00:00
										 |  |  |             if isSuspended(baseDir, nickname): | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2020-10-05 11:30:11 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 21:28:40 +00:00
										 |  |  |             if os.path.isfile(baseDir + '/accounts/' + handle + | 
					
						
							|  |  |  |                               '/.nonewswire'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |             # is there a blogs timeline for this account? | 
					
						
							| 
									
										
										
										
											2020-10-06 09:41:04 +00:00
										 |  |  |             accountDir = os.path.join(baseDir + '/accounts', handle) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |             blogsIndex = accountDir + '/tlblogs.index' | 
					
						
							|  |  |  |             if os.path.isfile(blogsIndex): | 
					
						
							|  |  |  |                 domain = handle.split('@')[1] | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |                 _addAccountBlogsToNewswire(baseDir, nickname, domain, | 
					
						
							|  |  |  |                                            newswire, maxBlogsPerAccount, | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                                            blogsIndex, maxTags, | 
					
						
							|  |  |  |                                            systemLanguage) | 
					
						
							| 
									
										
										
										
											2020-12-13 22:13:45 +00:00
										 |  |  |         break | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     # sort the moderation dict into chronological order, latest first | 
					
						
							|  |  |  |     sortedModerationDict = \ | 
					
						
							|  |  |  |         OrderedDict(sorted(moderationDict.items(), reverse=True)) | 
					
						
							| 
									
										
										
										
											2020-10-06 12:15:35 +00:00
										 |  |  |     # save the moderation queue details for later display | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  |     newswireModerationFilename = baseDir + '/accounts/newswiremoderation.txt' | 
					
						
							| 
									
										
										
										
											2020-10-06 14:32:53 +00:00
										 |  |  |     if sortedModerationDict: | 
					
						
							|  |  |  |         saveJson(sortedModerationDict, newswireModerationFilename) | 
					
						
							|  |  |  |     else: | 
					
						
							|  |  |  |         # remove the file if there is nothing to moderate | 
					
						
							|  |  |  |         if os.path.isfile(newswireModerationFilename): | 
					
						
							| 
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 |  |  |             try: | 
					
						
							|  |  |  |                 os.remove(newswireModerationFilename) | 
					
						
							|  |  |  |             except BaseException: | 
					
						
							|  |  |  |                 pass | 
					
						
							| 
									
										
										
										
											2020-10-06 11:28:32 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-17 16:08:07 +00:00
										 |  |  | def getDictFromNewswire(session, baseDir: str, domain: str, | 
					
						
							| 
									
										
										
										
											2020-10-23 14:41:29 +00:00
										 |  |  |                         maxPostsPerSource: int, maxFeedSizeKb: int, | 
					
						
							| 
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 |  |  |                         maxTags: int, maxFeedItemSizeKb: int, | 
					
						
							| 
									
										
										
										
											2020-12-02 17:02:32 +00:00
										 |  |  |                         maxNewswirePosts: int, | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                         maxCategoriesFeedItemSizeKb: int, | 
					
						
							|  |  |  |                         systemLanguage: str) -> {}: | 
					
						
							| 
									
										
										
										
											2020-10-04 09:59:55 +00:00
										 |  |  |     """Gets rss feeds as a dictionary from newswire file
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:59:55 +00:00
										 |  |  |     subscriptionsFilename = baseDir + '/accounts/newswire.txt' | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     if not os.path.isfile(subscriptionsFilename): | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-16 10:13:14 +00:00
										 |  |  |     maxPostsPerSource = 5 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  |     # add rss feeds | 
					
						
							| 
									
										
										
										
											2021-06-21 22:52:04 +00:00
										 |  |  |     rssFeed = [] | 
					
						
							|  |  |  |     with open(subscriptionsFilename, 'r') as fp: | 
					
						
							|  |  |  |         rssFeed = fp.readlines() | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     result = {} | 
					
						
							|  |  |  |     for url in rssFeed: | 
					
						
							|  |  |  |         url = url.strip() | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # Does this contain a url? | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         if '://' not in url: | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # is this a comment? | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |         if url.startswith('#'): | 
					
						
							|  |  |  |             continue | 
					
						
							| 
									
										
										
										
											2020-10-09 10:33:06 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         # should this feed be moderated? | 
					
						
							|  |  |  |         moderated = False | 
					
						
							|  |  |  |         if '*' in url: | 
					
						
							|  |  |  |             moderated = True | 
					
						
							|  |  |  |             url = url.replace('*', '').strip() | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-19 14:37:17 +00:00
										 |  |  |         # should this feed content be mirrored? | 
					
						
							|  |  |  |         mirrored = False | 
					
						
							|  |  |  |         if '!' in url: | 
					
						
							|  |  |  |             mirrored = True | 
					
						
							|  |  |  |             url = url.replace('!', '').strip() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         itemsList = getRSS(baseDir, domain, session, url, | 
					
						
							|  |  |  |                            moderated, mirrored, | 
					
						
							| 
									
										
										
										
											2020-11-03 16:04:25 +00:00
										 |  |  |                            maxPostsPerSource, maxFeedSizeKb, | 
					
						
							| 
									
										
										
										
											2020-12-02 17:02:32 +00:00
										 |  |  |                            maxFeedItemSizeKb, | 
					
						
							|  |  |  |                            maxCategoriesFeedItemSizeKb) | 
					
						
							| 
									
										
										
										
											2020-11-03 15:04:33 +00:00
										 |  |  |         if itemsList: | 
					
						
							|  |  |  |             for dateStr, item in itemsList.items(): | 
					
						
							|  |  |  |                 result[dateStr] = item | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-06 09:47:58 +00:00
										 |  |  |     # add blogs from each user account | 
					
						
							| 
									
										
										
										
											2020-12-22 18:06:23 +00:00
										 |  |  |     _addBlogsToNewswire(baseDir, domain, result, | 
					
						
							| 
									
										
										
										
											2021-07-18 14:15:16 +00:00
										 |  |  |                         maxPostsPerSource, maxTags, systemLanguage) | 
					
						
							| 
									
										
										
										
											2020-10-05 11:11:48 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # sort into chronological order, latest first | 
					
						
							| 
									
										
										
										
											2020-10-04 21:45:46 +00:00
										 |  |  |     sortedResult = OrderedDict(sorted(result.items(), reverse=True)) | 
					
						
							| 
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # are there too many posts? If so then remove the oldest ones | 
					
						
							|  |  |  |     noOfPosts = len(sortedResult.items()) | 
					
						
							|  |  |  |     if noOfPosts > maxNewswirePosts: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 |  |  |         ctr = 0 | 
					
						
							|  |  |  |         removals = [] | 
					
						
							|  |  |  |         for dateStr, item in sortedResult.items(): | 
					
						
							|  |  |  |             ctr += 1 | 
					
						
							| 
									
										
										
										
											2020-11-22 12:25:53 +00:00
										 |  |  |             if ctr > maxNewswirePosts: | 
					
						
							| 
									
										
										
										
											2020-11-22 12:05:15 +00:00
										 |  |  |                 removals.append(dateStr) | 
					
						
							|  |  |  |         for r in removals: | 
					
						
							|  |  |  |             sortedResult.pop(r) | 
					
						
							| 
									
										
										
										
											2020-11-22 11:48:53 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-10-04 09:51:12 +00:00
										 |  |  |     return sortedResult |