Reject blocked domains within news feeds

2020-10-16 12:58:31 +01:00 · 2020-10-16 12:58:31 +01:00 · e5950d4363
parent c3dbec6181
commit e5950d4363
1 changed file with 18 additions and 2 deletions
--- a/newswire.py
+++ b/newswire.py
@@ -16,7 +16,8 @@ from utils import locatePost
 from utils import loadJson
 from utils import saveJson
 from utils import isSuspended
-
+from utils import containsInvalidChars
+from blocking import isBlockedDomain

 def rss2Header(httpPrefix: str,
               nickname: str, domainFull: str,
@@ -80,6 +81,13 @@ def xml2StrToDict(xmlStr: str, moderated: bool,
            description = description.split('</description>')[0]
        link = rssItem.split('<link>')[1]
        link = link.split('</link>')[0]
+        if '://' not in link:
+            continue
+        domain = link.split('://')[1]
+        if '/' in domain:
+            domain = domain.split('/')[0]
+        if isBlockedDomain(baseDir, domain):
+            continue
        pubDate = rssItem.split('<pubDate>')[1]
        pubDate = pubDate.split('</pubDate>')[0]
        parsed = False
@@ -147,6 +155,13 @@ def atomFeedToDict(xmlStr: str, moderated: bool,
            description = description.split('</summary>')[0]
        link = rssItem.split('<link>')[1]
        link = link.split('</link>')[0]
+        if '://' not in link:
+            continue
+        domain = link.split('://')[1]
+        if '/' in domain:
+            domain = domain.split('/')[0]
+        if isBlockedDomain(baseDir, domain):
+            continue
        pubDate = rssItem.split('<updated>')[1]
        pubDate = pubDate.split('</updated>')[0]
        parsed = False
@@ -221,7 +236,8 @@ def getRSS(session, url: str, moderated: bool,
    try:
        result = session.get(url, headers=sessionHeaders, params=sessionParams)
        if result:
-            if int(len(result) / 1024) < maxFeedSizeKb:
+            if int(len(result) / 1024) < maxFeedSizeKb and \
+               not containsInvalidChars(result):
                return xmlStrToDict(result.text, moderated, maxPostsPerSource)
            else:
                print('WARN: feed is too large: ' + url)