Validate newswire item dates

So they can't be in the future or too far in the past
merge-requests/8/head
Bob Mottram 2020-12-21 12:11:45 +00:00
parent 31a48db4a4
commit 23301b35a5
3 changed files with 76 additions and 63 deletions

View File

@ -10,6 +10,7 @@ import json
import os
import datetime
import time
from utils import validPostDate
from utils import getFullDomain
from utils import isEventPost
from utils import removeIdEnding
@ -71,29 +72,6 @@ from delete import removeOldHashtags
from follow import isFollowingActor
def validPostDate(published: str) -> bool:
    """Returns true if the published date is recent and is not in the future

    published is expected to be a UTC timestamp in the form
    YYYY-MM-DDTHH:MM:SSZ. A value which can't be parsed in that form
    is treated as invalid, rather than raising an exception.
    Dates are compared as whole days since the unix epoch, and a post
    is rejected if it is more than 3 days old.
    """
    try:
        postTimeObject = \
            datetime.datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        # ValueError: the string doesn't match the expected format
        # TypeError: the value isn't a string at all
        print("Inbox post has an invalid published date")
        return False

    baselineTime = datetime.datetime(1970, 1, 1)
    nowDaysSinceEpoch = (datetime.datetime.utcnow() - baselineTime).days
    postDaysSinceEpoch = (postTimeObject - baselineTime).days

    if postDaysSinceEpoch > nowDaysSinceEpoch:
        print("Inbox post has a published date in the future!")
        return False
    if nowDaysSinceEpoch - postDaysSinceEpoch > 3:
        print("Inbox post is not recent enough")
        return False
    return True
def guessHashtagCategory(tagName: str, hashtagCategories: {}) -> str:
"""Tries to guess a category for the given hashtag.
This works by trying to find the longest similar hashtag

View File

@ -14,6 +14,7 @@ from datetime import datetime
from datetime import timedelta
from datetime import timezone
from collections import OrderedDict
from utils import validPostDate
from utils import setHashtagCategory
from utils import firstParagraphFromString
from utils import isPublicPost
@ -142,6 +143,13 @@ def addNewswireDictEntry(baseDir: str, domain: str,
]
def validFeedDate(pubDate: str) -> bool:
    """Returns true if the given feed item date passes validPostDate
    with a maximum age of 30 days

    pubDate is expected in the form YYYY-MM-DD HH:MM:SS+00:00
    A date which doesn't convert into the format which validPostDate
    parses is treated as invalid.
    """
    # convert from YYYY-MM-DD HH:MM:SS+00:00 to
    # YYYY-MM-DDTHH:MM:SSZ
    postDate = pubDate.replace(' ', 'T').replace('+00:00', 'Z')
    try:
        return validPostDate(postDate, 30)
    except ValueError:
        # only the +00:00 offset is rewritten above, so anything else
        # (eg. a different utc offset) fails to parse and is rejected
        # here instead of aborting the processing of the whole feed
        return False
def parseFeedDate(pubDate: str) -> str:
"""Returns a UTC date string based on the given date string
This tries a number of formats to see which work
@ -317,16 +325,17 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
pubDateStr = parseFeedDate(pubDate)
if pubDateStr:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if validFeedDate(pubDateStr):
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if postCtr > 0:
print('Added ' + str(postCtr) + ' rss 2.0 feed items to newswire')
return result
@ -400,16 +409,17 @@ def xml1StrToDict(baseDir: str, domain: str, xmlStr: str,
pubDateStr = parseFeedDate(pubDate)
if pubDateStr:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if validFeedDate(pubDateStr):
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if postCtr > 0:
print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire')
return result
@ -471,16 +481,17 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
pubDateStr = parseFeedDate(pubDate)
if pubDateStr:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if validFeedDate(pubDateStr):
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if postCtr > 0:
print('Added ' + str(postCtr) + ' atom feed items to newswire')
return result
@ -540,16 +551,17 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
pubDateStr = parseFeedDate(pubDate)
if pubDateStr:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if validFeedDate(pubDateStr):
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
result, pubDateStr,
title, link,
votesStatus, postFilename,
description, moderated, mirrored)
postCtr += 1
if postCtr >= maxPostsPerSource:
break
if postCtr > 0:
print('Added ' + str(postCtr) + ' YouTube feed items to newswire')
return result

View File

@ -19,6 +19,29 @@ from calendar import monthrange
from followingCalendar import addPersonToCalendar
def validPostDate(published: str, maxAgeDays=7) -> bool:
    """Returns true if the published date is recent and is not in the future

    published is expected to be a UTC timestamp in the form
    YYYY-MM-DDTHH:MM:SSZ. A value which can't be parsed in that form
    is treated as invalid, rather than raising an exception, so that a
    single malformed date can't abort the caller.
    maxAgeDays is the age in whole days at which a post stops being
    accepted: a post which is maxAgeDays or more days old is rejected.
    """
    try:
        postTimeObject = \
            datetime.datetime.strptime(published, "%Y-%m-%dT%H:%M:%SZ")
    except (ValueError, TypeError):
        # ValueError: the string doesn't match the expected format
        # TypeError: the value isn't a string at all
        print("Inbox post has an invalid published date")
        return False

    # dates are compared as whole days since the unix epoch,
    # so the time of day is ignored
    baselineTime = datetime.datetime(1970, 1, 1)
    nowDaysSinceEpoch = (datetime.datetime.utcnow() - baselineTime).days
    postDaysSinceEpoch = (postTimeObject - baselineTime).days

    if postDaysSinceEpoch > nowDaysSinceEpoch:
        print("Inbox post has a published date in the future!")
        return False
    if nowDaysSinceEpoch - postDaysSinceEpoch >= maxAgeDays:
        print("Inbox post is not recent enough")
        return False
    return True
def getFullDomain(domain: str, port: int) -> str:
"""Returns the full domain name, including port number
"""