forked from indymedia/epicyon
				
			Support for rss 1.0 feeds
							parent
							
								
									8d29ac3cbe
								
							
						
					
					
						commit
						96c139a63e
					
				
							
								
								
									
										87
									
								
								newswire.py
								
								
								
								
							
							
						
						
									
										87
									
								
								newswire.py
								
								
								
								
							|  | @ -258,7 +258,7 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, | |||
|                   maxPostsPerSource: int, | ||||
|                   maxFeedItemSizeKb: int, | ||||
|                   maxCategoriesFeedItemSizeKb: int) -> {}: | ||||
|     """Converts an xml 2.0 string to a dictionary | ||||
|     """Converts an xml RSS 2.0 string to a dictionary | ||||
|     """ | ||||
|     if '<item>' not in xmlStr: | ||||
|         return {} | ||||
|  | @ -332,6 +332,86 @@ def xml2StrToDict(baseDir: str, domain: str, xmlStr: str, | |||
|     return result | ||||
| 
 | ||||
| 
 | ||||
| def xml1StrToDict(baseDir: str, domain: str, xmlStr: str, | ||||
|                   moderated: bool, mirrored: bool, | ||||
|                   maxPostsPerSource: int, | ||||
|                   maxFeedItemSizeKb: int, | ||||
|                   maxCategoriesFeedItemSizeKb: int) -> {}: | ||||
|     """Converts an xml RSS 1.0 string to a dictionary | ||||
|     https://validator.w3.org/feed/docs/rss1.html | ||||
|     """ | ||||
|     if '<item' not in xmlStr: | ||||
|         return {} | ||||
|     result = {} | ||||
| 
 | ||||
|     # is this an rss feed containing hashtag categories? | ||||
|     if '<title>#categories</title>' in xmlStr: | ||||
|         xml2StrToHashtagCategories(baseDir, xmlStr, | ||||
|                                    maxCategoriesFeedItemSizeKb) | ||||
|         return {} | ||||
| 
 | ||||
|     rssItems = xmlStr.split('<item') | ||||
|     postCtr = 0 | ||||
|     maxBytes = maxFeedItemSizeKb * 1024 | ||||
|     for rssItem in rssItems: | ||||
|         if not rssItem: | ||||
|             continue | ||||
|         if len(rssItem) > maxBytes: | ||||
|             print('WARN: rss feed item is too big') | ||||
|             continue | ||||
|         if '<title>' not in rssItem: | ||||
|             continue | ||||
|         if '</title>' not in rssItem: | ||||
|             continue | ||||
|         if '<link>' not in rssItem: | ||||
|             continue | ||||
|         if '</link>' not in rssItem: | ||||
|             continue | ||||
|         if '<dc:date>' not in rssItem: | ||||
|             continue | ||||
|         if '</dc:date>' not in rssItem: | ||||
|             continue | ||||
|         title = rssItem.split('<title>')[1] | ||||
|         title = removeCDATA(title.split('</title>')[0]) | ||||
|         description = '' | ||||
|         if '<description>' in rssItem and '</description>' in rssItem: | ||||
|             description = rssItem.split('<description>')[1] | ||||
|             description = removeCDATA(description.split('</description>')[0]) | ||||
|         else: | ||||
|             if '<media:description>' in rssItem and \ | ||||
|                '</media:description>' in rssItem: | ||||
|                 description = rssItem.split('<media:description>')[1] | ||||
|                 description = description.split('</media:description>')[0] | ||||
|                 description = removeCDATA(description) | ||||
|         link = rssItem.split('<link>')[1] | ||||
|         link = link.split('</link>')[0] | ||||
|         if '://' not in link: | ||||
|             continue | ||||
|         itemDomain = link.split('://')[1] | ||||
|         if '/' in itemDomain: | ||||
|             itemDomain = itemDomain.split('/')[0] | ||||
|         if isBlockedDomain(baseDir, itemDomain): | ||||
|             continue | ||||
|         pubDate = rssItem.split('<dc:date>')[1] | ||||
|         pubDate = pubDate.split('</dc:date>')[0] | ||||
| 
 | ||||
|         pubDateStr = parseFeedDate(pubDate) | ||||
|         if pubDateStr: | ||||
|             postFilename = '' | ||||
|             votesStatus = [] | ||||
|             addNewswireDictEntry(baseDir, domain, | ||||
|                                  result, pubDateStr, | ||||
|                                  title, link, | ||||
|                                  votesStatus, postFilename, | ||||
|                                  description, moderated, mirrored) | ||||
|             postCtr += 1 | ||||
|             if postCtr >= maxPostsPerSource: | ||||
|                 break | ||||
|     if postCtr > 0: | ||||
|         print('Added ' + str(postCtr) + ' rss 1.0 feed items to newswire') | ||||
|     return result | ||||
| 
 | ||||
| 
 | ||||
| def atomFeedToDict(baseDir: str, domain: str, xmlStr: str, | ||||
|                    moderated: bool, mirrored: bool, | ||||
|                    maxPostsPerSource: int, | ||||
|  | @ -489,6 +569,11 @@ def xmlStrToDict(baseDir: str, domain: str, xmlStr: str, | |||
|                              xmlStr, moderated, mirrored, | ||||
|                              maxPostsPerSource, maxFeedItemSizeKb, | ||||
|                              maxCategoriesFeedItemSizeKb) | ||||
|     elif '<rdf:RDF>' in xmlStr: | ||||
|         return xml1StrToDict(baseDir, domain, | ||||
|                              xmlStr, moderated, mirrored, | ||||
|                              maxPostsPerSource, maxFeedItemSizeKb, | ||||
|                              maxCategoriesFeedItemSizeKb) | ||||
|     elif 'xmlns="http://www.w3.org/2005/Atom"' in xmlStr: | ||||
|         return atomFeedToDict(baseDir, domain, | ||||
|                               xmlStr, moderated, mirrored, | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue