Include hashtags within cw lists

merge-requests/30/head
Bob Mottram 2024-02-23 10:32:46 +00:00
parent 781db96120
commit f32333953c
2 changed files with 154 additions and 114 deletions

View File

@ -31,7 +31,9 @@ def load_cw_lists(base_dir: str, verbose: bool) -> {}:
continue continue
if not list_json.get('name'): if not list_json.get('name'):
continue continue
if not list_json.get('words') and not list_json.get('domains'): if not list_json.get('words') and \
not list_json.get('hashtags') and \
not list_json.get('domains'):
continue continue
name = list_json['name'] name = list_json['name']
if verbose: if verbose:
@ -59,6 +61,12 @@ def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},
languages_understood, "content") languages_understood, "content")
if not content: if not content:
return return
# Collect the post's tag entries so that cw lists can later be matched
# against its hashtags. Defaults to an empty list when the post object has
# no 'tag' field, or when that field is not a list.
# NOTE(review): post_json_object['object'] is subscripted directly, so this
# presumably relies on the caller having checked it is a dict - confirm.
post_tags = []
if post_json_object['object'].get('tag'):
if isinstance(post_json_object['object']['tag'], list):
post_tags = post_json_object['object']['tag']
for name, item in cw_lists.items(): for name, item in cw_lists.items():
if name not in lists_enabled: if name not in lists_enabled:
continue continue
@ -76,6 +84,35 @@ def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {},
matched = False matched = False
# Match hashtags within the post.
# Each hashtag configured on this cw list is normalised (whitespace
# stripped, leading '#' added if absent, lowercased) and compared against
# the lowercased 'name' of every tag entry attached to the post.
# On the first hit the list's warning is prepended to any existing cw_text
# (joined with ' / '), or becomes cw_text if none exists yet.
if post_tags and item.get('hashtags'):
for tag in item['hashtags']:
tag = tag.strip()
if not tag:
continue
if not tag.startswith('#'):
tag = '#' + tag
tag = tag.lower()
for tag_dict in post_tags:
if not isinstance(tag_dict, dict):
continue
# NOTE(review): this tests for a key literally named 'Hashtag'.
# ActivityPub tag objects usually look like
# {"type": "Hashtag", "name": "#tag", ...}; if that is the shape of
# post_tags then this guard skips every entry and the hashtag match
# can never fire. Presumably the intent was
# tag_dict.get('type') == 'Hashtag' - confirm against the tag schema.
if not tag_dict.get('Hashtag'):
continue
if not tag_dict.get('name'):
continue
if tag_dict['name'].lower() == tag:
if cw_text:
cw_text = warning + ' / ' + cw_text
else:
cw_text = warning
matched = True
# one matching post tag is enough for this configured hashtag
break
if matched:
# stop checking the remaining configured hashtags of this list
break
if matched:
# The warning for this list has been applied, so skip the checks that
# follow for it (presumably this continues the enclosing loop over
# cw_lists - the loop header is outside this hunk).
continue
# match domains within the content # match domains within the content
if item.get('domains'): if item.get('domains'):
for domain in item['domains']: for domain in item['domains']:

View File

@ -3,119 +3,122 @@
"warning": "Satire", "warning": "Satire",
"description": "Intended to be humorous. Not real news stories.", "description": "Intended to be humorous. Not real news stories.",
"words": [], "words": [],
"hashtags": [
"satire"
],
"domains": [ "domains": [
"alhudood.net", "alhudood.net",
"adobochronicles.com", "adobochronicles.com",
"alternativelyfacts.com", "alternativelyfacts.com",
"alternative-science.com", "alternative-science.com",
"americaslastlineofdefense.com", "americaslastlineofdefense.com",
"babylonbee.com", "babylonbee.com",
"bluenewsnetwork.com", "bluenewsnetwork.com",
"borowitzreport.com", "borowitzreport.com",
"breakingburgh.com", "breakingburgh.com",
"bullshitnews.org", "bullshitnews.org",
"bustatroll.org", "bustatroll.org",
"burrardstreetjournal.com", "burrardstreetjournal.com",
"clickhole.com", "clickhole.com",
"confederacyofdrones.com", "confederacyofdrones.com",
"conservativetears.com", "conservativetears.com",
"cracked.com", "cracked.com",
"dailybonnet.com", "dailybonnet.com",
"dailysquib.co.uk", "dailysquib.co.uk",
"dailyworldupdate.us", "dailyworldupdate.us",
"dailysnark.com", "dailysnark.com",
"der-postillon.com", "der-postillon.com",
"derfmagazine.com", "derfmagazine.com",
"elchiguirebipolar.net", "elchiguirebipolar.net",
"elmundotoday.com", "elmundotoday.com",
"speld.nl", "speld.nl",
"duffelblog.com", "duffelblog.com",
"duhprogressive.com", "duhprogressive.com",
"elkoshary.com", "elkoshary.com",
"empirenews.net", "empirenews.net",
"empiresports.co", "empiresports.co",
"eveningharold.com", "eveningharold.com",
"fark.com", "fark.com",
"fmobserver.com", "fmobserver.com",
"fognews.ru", "fognews.ru",
"frankmag.ca", "frankmag.ca",
"framleyexaminer.com", "framleyexaminer.com",
"freedomcrossroads.com", "freedomcrossroads.com",
"freedomfictions.com", "freedomfictions.com",
"genesiustimes.com", "genesiustimes.com",
"gishgallop.com", "gishgallop.com",
"gomerblog.com", "gomerblog.com",
"harddawn.com", "harddawn.com",
"huzlers.com", "huzlers.com",
"www.imao.us", "www.imao.us",
"infobattle.org", "infobattle.org",
"islamicanews.com", "islamicanews.com",
"chronicle.su", "chronicle.su",
"landoverbaptist.org", "landoverbaptist.org",
"larknews.com", "larknews.com",
"legorafi.fr", "legorafi.fr",
"lercio.it", "lercio.it",
"madhousemagazine.com", "madhousemagazine.com",
"mcsweeneys.net", "mcsweeneys.net",
"moronmajority.com", "moronmajority.com",
"nationalreport.net", "nationalreport.net",
"newsbiscuit.com", "newsbiscuit.com",
"newsmutiny.com", "newsmutiny.com",
"newsthump.com", "newsthump.com",
"npcdaily.com", "npcdaily.com",
"prettycoolsite.com", "prettycoolsite.com",
"private-eye.co.uk", "private-eye.co.uk",
"realnewsrightnow.com", "realnewsrightnow.com",
"realrawnews.com", "realrawnews.com",
"reductress.com", "reductress.com",
"sanctumnews.com", "sanctumnews.com",
"satirev.org", "satirev.org",
"sportspickle.com", "sportspickle.com",
"stiltonsplace.blogspot.com", "stiltonsplace.blogspot.com",
"stubhillnews.com", "stubhillnews.com",
"stuppid.com", "stuppid.com",
"suffolkgazette.com", "suffolkgazette.com",
"sundaysportonline.co.uk", "sundaysportonline.co.uk",
"thatsprettygoodscience.com", "thatsprettygoodscience.com",
"atlbanana.com", "atlbanana.com",
"thebeaverton.com", "thebeaverton.com",
"betootaadvocate.com", "betootaadvocate.com",
"chaser.com.au", "chaser.com.au",
"dailydiscord.com", "dailydiscord.com",
"thedailymash.co.uk", "thedailymash.co.uk",
"halfwaypost.com", "halfwaypost.com",
"thehardtimes.net", "thehardtimes.net",
"humortimes.com", "humortimes.com",
"satirewire.com", "satirewire.com",
"scrappleface.com", "scrappleface.com",
"thelemonpress.co.uk", "thelemonpress.co.uk",
"themideastbeast.com", "themideastbeast.com",
"theneedling.com", "theneedling.com",
"theonion.com", "theonion.com",
"theoxymoron.co.uk", "theoxymoron.co.uk",
"thepeoplescube.com", "thepeoplescube.com",
"thepoke.co.uk", "thepoke.co.uk",
"therightists.com", "therightists.com",
"rochdaleherald.co.uk", "rochdaleherald.co.uk",
"politicalgarbagechute.com", "politicalgarbagechute.com",
"the-postillon.com", "the-postillon.com",
"thecivilian.co.nz", "thecivilian.co.nz",
"thedailyer.com", "thedailyer.com",
"thedailywtf.com", "thedailywtf.com",
"theredshtick.com", "theredshtick.com",
"thesciencepost.com", "thesciencepost.com",
"theshovel.com.au", "theshovel.com.au",
"thespoof.com", "thespoof.com",
"thestonkmarket.com", "thestonkmarket.com",
"thereisnews.com", "thereisnews.com",
"tittletattle365.com", "tittletattle365.com",
"truenorthtimes.ca", "truenorthtimes.ca",
"truthbrary.org", "truthbrary.org",
"walkingeaglenews.com", "walkingeaglenews.com",
"waterfordwhispersnews.com", "waterfordwhispersnews.com",
"weeklyworldnews.com", "weeklyworldnews.com",
"wokennews.com", "wokennews.com",
"worldnewsdailyreport.com", "worldnewsdailyreport.com",
"zaytung.com" "zaytung.com"
] ]
} }