diff --git a/newswire.py b/newswire.py
index 1e5e9998..2842ba11 100644
--- a/newswire.py
+++ b/newswire.py
@@ -136,6 +136,59 @@ def addNewswireDictEntry(baseDir: str, domain: str,
]
def parseFeedDate(pubDate: str):
    """Returns a datetime parsed from an RSS or Atom feed date string,
    or None if the string does not match any of the known formats.

    The known formats are tried in order and the first match wins:
      RSS with numeric offset   Tue, 01 Dec 2020 10:00:00 +0000
      RSS with EST suffix       Tue, 01 Dec 2020 10:00:00 EST
      RSS with UT suffix        Tue, 01 Dec 2020 10:00:00 UT
      Atom with Z suffix        2020-12-01T10:00:00Z
      Atom with numeric offset  2020-12-01T10:00:00+00:00

    Formats containing %z produce a timezone-aware datetime. The EST,
    UT and Z formats produce a naive datetime expressed in UTC
    (EST values have five hours added).
    """
    if not pubDate or not isinstance(pubDate, str):
        return None
    # surrounding whitespace from the feed markup would otherwise make
    # every strptime attempt fail
    pubDate = pubDate.strip()

    # each entry is (strptime format, hours to add to convert to UTC)
    formats = (
        ("%a, %d %b %Y %H:%M:%S %z", 0),
        ("%a, %d %b %Y %H:%M:%S EST", 5),
        ("%a, %d %b %Y %H:%M:%S UT", 0),
        ("%Y-%m-%dT%H:%M:%SZ", 0),
        ("%Y-%m-%dT%H:%M:%S%z", 0),
    )

    for dateFormat, hoursToUTC in formats:
        # strptime itself decides whether the format fits. Guessing from
        # individual characters is unreliable: 'T' occurs in "Tue"/"Thu"
        # and '-' occurs in negative offsets such as -0500
        try:
            publishedDate = datetime.strptime(pubDate, dateFormat)
        except ValueError:
            continue
        if hoursToUTC:
            publishedDate += timedelta(hours=hoursToUTC)
        return publishedDate

    # warn once, only after every format has been tried
    print('WARN: unrecognized date format: ' + pubDate)
    return None
+
+
def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
moderated: bool, mirrored: bool,
maxPostsPerSource: int,
@@ -187,10 +240,9 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
continue
pubDate = rssItem.split('')[1]
pubDate = pubDate.split('')[0]
- parsed = False
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S %z")
+
+ publishedDate = parseFeedDate(pubDate)
+ if publishedDate:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
@@ -201,51 +253,6 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str,
postCtr += 1
if postCtr >= maxPostsPerSource:
break
- parsed = True
- except BaseException:
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST")
- hoursAdded = timedelta(hours=5)
- publishedDate = publishedDate + hoursAdded
- postFilename = ''
- votesStatus = []
- pubDateStr = str(publishedDate) + '+00:00'
- addNewswireDictEntry(baseDir, domain,
- result, pubDateStr,
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized RSS date format EST: ' + pubDate)
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
- postFilename = ''
- votesStatus = []
- addNewswireDictEntry(baseDir, domain,
- result,
- str(publishedDate) + '+00:00',
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized RSS date format UT: ' + pubDate)
- pass
return result
@@ -300,10 +307,9 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
continue
pubDate = atomItem.split('')[1]
pubDate = pubDate.split('')[0]
- parsed = False
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ")
+
+ publishedDate = parseFeedDate(pubDate)
+ if publishedDate:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
@@ -314,51 +320,6 @@ def atomFeedToDict(baseDir: str, domain: str, xmlStr: str,
postCtr += 1
if postCtr >= maxPostsPerSource:
break
- parsed = True
- except BaseException:
- print('WARN: unrecognized atom date format UT: ' + pubDate)
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z")
- postFilename = ''
- votesStatus = []
- addNewswireDictEntry(baseDir, domain, result,
- str(publishedDate),
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized atom feed date format z: ' + pubDate)
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S EST")
- hoursAdded = timedelta(hours=5)
- publishedDate = publishedDate + hoursAdded
- postFilename = ''
- votesStatus = []
- pubDateStr = str(publishedDate) + '+00:00'
- addNewswireDictEntry(baseDir, domain,
- result, pubDateStr,
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized atom date format EST: ' + pubDate)
- pass
return result
@@ -410,10 +371,9 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
link = 'https://www.youtube.com/watch?v=' + link.strip()
pubDate = atomItem.split('')[1]
pubDate = pubDate.split('')[0]
- parsed = False
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%SZ")
+
+ publishedDate = parseFeedDate(pubDate)
+ if publishedDate:
postFilename = ''
votesStatus = []
addNewswireDictEntry(baseDir, domain,
@@ -424,50 +384,6 @@ def atomFeedYTToDict(baseDir: str, domain: str, xmlStr: str,
postCtr += 1
if postCtr >= maxPostsPerSource:
break
- parsed = True
- except BaseException:
- print('WARN: unrecognized YT atom date format UT: ' + pubDate)
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%Y-%m-%dT%H:%M:%S%z")
- postFilename = ''
- votesStatus = []
- addNewswireDictEntry(baseDir, domain, result,
- str(publishedDate),
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized YT atom feed date format z: ' +
- pubDate)
- pass
-
- if not parsed:
- try:
- publishedDate = \
- datetime.strptime(pubDate, "%a, %d %b %Y %H:%M:%S UT")
- postFilename = ''
- votesStatus = []
- addNewswireDictEntry(baseDir, domain, result,
- str(publishedDate) + '+00:00',
- title, link,
- votesStatus, postFilename,
- description, moderated, mirrored)
- postCtr += 1
- if postCtr >= maxPostsPerSource:
- break
- parsed = True
- except BaseException:
- print('WARN: unrecognized YT atom feed date format UT: ' +
- pubDate)
- pass
return result