forked from indymedia/epicyon
Validate newswire item dates
So they can't be in the future or too far in the past
alt-html-css
parent
31a48db4a4
commit
23301b35a5
24
inbox.py
24
inbox.py
|
@ -10,6 +10,7 @@ import json
|
||||||
import os
|
import os
|
||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
|
from utils import validPostDate
|
||||||
from utils import getFullDomain
|
from utils import getFullDomain
|
||||||
from utils import isEventPost
|
from utils import isEventPost
|
||||||
from utils import removeIdEnding
|
from utils import removeIdEnding
|
||||||
|
@ -71,29 +72,6 @@ from delete import removeOldHashtags
|
||||||
from follow import isFollowingActor
|
from follow import isFollowingActor
|
||||||
|
|
||||||
|
|
||||||
def validPostDate(published: str) -> bool:
|
|
||||||
"""Returns true if the published date is recent and is not in the future
|
|
||||||
"""
|
|
||||||
baselineTime = datetime.datetime(1970, 1, 1)
|
|
||||||
|
|
||||||
daysDiff = datetime.datetime.utcnow() - baselineTime
|
|
||||||
nowDaysSinceEpoch = daysDiff.days
|
|
||||||
|
|
||||||
postTimeObject = \
|
|
||||||
datetime.datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
|
|
||||||
daysDiff = postTimeObject - baselineTime
|
|
||||||
postDaysSinceEpoch = daysDiff.days
|
|
||||||
|
|
||||||
if postDaysSinceEpoch > nowDaysSinceEpoch:
|
|
||||||
print("Inbox post has a published date in the future!")
|
|
||||||
return False
|
|
||||||
|
|
||||||
if nowDaysSinceEpoch - postDaysSinceEpoch > 3:
|
|
||||||
print("Inbox post is not recent enough")
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
|
def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
|
||||||
"""Tries to guess a category for the given hashtag.
|
"""Tries to guess a category for the given hashtag.
|
||||||
This works by trying to find the longest similar hashtag
|
This works by trying to find the longest similar hashtag
|
||||||
|
|
92
newswire.py
92
newswire.py
|
@ -14,6 +14,7 @@ from datetime import datetime
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from datetime import timezone
|
from datetime import timezone
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
|
from utils import validPostDate
|
||||||
from utils import setHashtagCategory
|
from utils import setHashtagCategory
|
||||||
from utils import firstParagraphFromString
|
from utils import firstParagraphFromString
|
||||||
from utils import isPublicPost
|
from utils import isPublicPost
|
||||||
|
@ -142,6 +143,13 @@ def addNewswireDictEntry(baseDir: str, domain: str,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def validFeedDate(pubDate: str) -> bool:
|
||||||
|
# convert from YY-MM-DD HH:MM:SS+00:00 to
|
||||||
|
# YY-MM-DDTHH:MM:SSZ
|
||||||
|
postDate = pubDate.replace(' ', 'T').replace('+00:00', 'Z')
|
||||||
|
return validPostDate(postDate, 30)
|
||||||
|
|
||||||
|
|
||||||
def parseFeedDate(pubDate: str) -> str:
|
def parseFeedDate(pubDate: str) -> str:
|
||||||
"""Returns a UTC date string based on the given date string
|
"""Returns a UTC date string based on the given date string
|
||||||
This tries a number of formats to see which work
|
This tries a number of formats to see which work
|
||||||
|
@ -317,16 +325,17 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
|
|
||||||
pubDateStr = parseFeedDate(pubDate)
|
pubDateStr = parseFeedDate(pubDate)
|
||||||
if pubDateStr:
|
if pubDateStr:
|
||||||
postFilename = ''
|
if validFeedDate(pubDateStr):
|
||||||
votesStatus = []
|
postFilename = ''
|
||||||
addNewswireDictEntry(baseDir, domain,
|
votesStatus = []
|
||||||
result, pubDateStr,
|
addNewswireDictEntry(baseDir, domain,
|
||||||
title, link,
|
result, pubDateStr,
|
||||||
votesStatus, postFilename,
|
title, link,
|
||||||
description, moderated, mirrored)
|
votesStatus, postFilename,
|
||||||
postCtr += 1
|
description, moderated, mirrored)
|
||||||
if postCtr >= maxPostsPerSource:
|
postCtr += 1
|
||||||
break
|
if postCtr >= maxPostsPerSource:
|
||||||
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' rss 2.0 feed items to newswire')
|
print('Added ' + str(postCtr) + ' rss 2.0 feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
@ -400,16 +409,17 @@ def xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
|
|
||||||
pubDateStr = parseFeedDate(pubDate)
|
pubDateStr = parseFeedDate(pubDate)
|
||||||
if pubDateStr:
|
if pubDateStr:
|
||||||
postFilename = ''
|
if validFeedDate(pubDateStr):
|
||||||
votesStatus = []
|
postFilename = ''
|
||||||
addNewswireDictEntry(baseDir, domain,
|
votesStatus = []
|
||||||
result, pubDateStr,
|
addNewswireDictEntry(baseDir, domain,
|
||||||
title, link,
|
result, pubDateStr,
|
||||||
votesStatus, postFilename,
|
title, link,
|
||||||
description, moderated, mirrored)
|
votesStatus, postFilename,
|
||||||
postCtr += 1
|
description, moderated, mirrored)
|
||||||
if postCtr >= maxPostsPerSource:
|
postCtr += 1
|
||||||
break
|
if postCtr >= maxPostsPerSource:
|
||||||
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
|
print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
@ -471,16 +481,17 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
|
|
||||||
pubDateStr = parseFeedDate(pubDate)
|
pubDateStr = parseFeedDate(pubDate)
|
||||||
if pubDateStr:
|
if pubDateStr:
|
||||||
postFilename = ''
|
if validFeedDate(pubDateStr):
|
||||||
votesStatus = []
|
postFilename = ''
|
||||||
addNewswireDictEntry(baseDir, domain,
|
votesStatus = []
|
||||||
result, pubDateStr,
|
addNewswireDictEntry(baseDir, domain,
|
||||||
title, link,
|
result, pubDateStr,
|
||||||
votesStatus, postFilename,
|
title, link,
|
||||||
description, moderated, mirrored)
|
votesStatus, postFilename,
|
||||||
postCtr += 1
|
description, moderated, mirrored)
|
||||||
if postCtr >= maxPostsPerSource:
|
postCtr += 1
|
||||||
break
|
if postCtr >= maxPostsPerSource:
|
||||||
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' atom feed items to newswire')
|
print('Added ' + str(postCtr) + ' atom feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
@ -540,16 +551,17 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
|
||||||
|
|
||||||
pubDateStr = parseFeedDate(pubDate)
|
pubDateStr = parseFeedDate(pubDate)
|
||||||
if pubDateStr:
|
if pubDateStr:
|
||||||
postFilename = ''
|
if validFeedDate(pubDateStr):
|
||||||
votesStatus = []
|
postFilename = ''
|
||||||
addNewswireDictEntry(baseDir, domain,
|
votesStatus = []
|
||||||
result, pubDateStr,
|
addNewswireDictEntry(baseDir, domain,
|
||||||
title, link,
|
result, pubDateStr,
|
||||||
votesStatus, postFilename,
|
title, link,
|
||||||
description, moderated, mirrored)
|
votesStatus, postFilename,
|
||||||
postCtr += 1
|
description, moderated, mirrored)
|
||||||
if postCtr >= maxPostsPerSource:
|
postCtr += 1
|
||||||
break
|
if postCtr >= maxPostsPerSource:
|
||||||
|
break
|
||||||
if postCtr > 0:
|
if postCtr > 0:
|
||||||
print('Added ' + str(postCtr) + ' YouTube feed items to newswire')
|
print('Added ' + str(postCtr) + ' YouTube feed items to newswire')
|
||||||
return result
|
return result
|
||||||
|
|
23
utils.py
23
utils.py
|
@ -19,6 +19,29 @@ from calendar import monthrange
|
||||||
from followingCalendar import addPersonToCalendar
|
from followingCalendar import addPersonToCalendar
|
||||||
|
|
||||||
|
|
||||||
|
def validPostDate(published: str, maxAgeDays=7) -> bool:
|
||||||
|
"""Returns true if the published date is recent and is not in the future
|
||||||
|
"""
|
||||||
|
baselineTime = datetime.datetime(1970, 1, 1)
|
||||||
|
|
||||||
|
daysDiff = datetime.datetime.utcnow() - baselineTime
|
||||||
|
nowDaysSinceEpoch = daysDiff.days
|
||||||
|
|
||||||
|
postTimeObject = \
|
||||||
|
datetime.datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
daysDiff = postTimeObject - baselineTime
|
||||||
|
postDaysSinceEpoch = daysDiff.days
|
||||||
|
|
||||||
|
if postDaysSinceEpoch > nowDaysSinceEpoch:
|
||||||
|
print("Inbox post has a published date in the future!")
|
||||||
|
return False
|
||||||
|
|
||||||
|
if nowDaysSinceEpoch - postDaysSinceEpoch >= maxAgeDays:
|
||||||
|
print("Inbox post is not recent enough")
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def getFullDomain(domain: str, port: int) -> str:
|
def getFullDomain(domain: str, port: int) -> str:
|
||||||
"""Returns the full domain name, including port number
|
"""Returns the full domain name, including port number
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in New Issue