From ac3b417b622ccef68e86ff8083562c0fa6e945c2 Mon Sep 17 00:00:00 2001 From: Bob Mottram Date: Tue, 12 Dec 2023 18:30:31 +0000 Subject: [PATCH] Content warnings for military-industrial usa --- blocking.py | 19 ++++-- crawlers.py | 20 ++++-- cwlists/military_us.json | 139 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 cwlists/military_us.json diff --git a/blocking.py b/blocking.py index c557178c4..2cfa750aa 100644 --- a/blocking.py +++ b/blocking.py @@ -1791,17 +1791,24 @@ def save_blocked_military(base_dir: str, block_military: {}) -> None: def get_mil_domains_list() -> []: - """returns a list of military top level domains + """returns a list of military domains """ - return ('army', 'navy', 'airforce', 'mil') + return ('army', 'navy', 'airforce', 'mil', + 'sncorp.com', 'sierranevadacorp.us', 'ncontext.com') def contains_military_domain(message_str: str) -> bool: """Returns true if the given string contains a military domain """ mil_domains = get_mil_domains_list() - for tld in mil_domains: - if '.' + tld + '"' in message_str or \ - '.' + tld + '/' in message_str: - return True + for domain_str in mil_domains: + if '.' not in domain_str: + tld = domain_str + if '.' + tld + '"' in message_str or \ + '.' + tld + '/' in message_str: + return True + else: + if domain_str + '"' in message_str or \ + domain_str + '/' in message_str: + return True return False diff --git a/crawlers.py b/crawlers.py index 2f82c1cf5..5aec5e51d 100644 --- a/crawlers.py +++ b/crawlers.py @@ -192,11 +192,19 @@ def blocked_user_agent(calling_domain: str, agent_str: str, # does this account block military domains? if block_military.get(nickname): mil_domains = get_mil_domains_list() - for tld in mil_domains: - if agent_domain.endswith('.' + tld): - blocked_ua = True - print('BLOCK: Blocked military user agent: ' + - agent_domain) - break + for domain_str in mil_domains: + if '.' not in domain_str: + tld = domain_str + if agent_domain.endswith('.' + tld): + blocked_ua = True + print('BLOCK: Blocked military tld user agent: ' + + agent_domain) + break + else: + if agent_domain.endswith(domain_str): + blocked_ua = True + print('BLOCK: Blocked military user agent: ' + + agent_domain) + break return blocked_ua, blocked_cache_last_updated diff --git a/cwlists/military_us.json b/cwlists/military_us.json new file mode 100644 index 000000000..ad39a8746 --- /dev/null +++ b/cwlists/military_us.json @@ -0,0 +1,139 @@ +{ + "name": "Military-industrial complex (USA)", + "warning": "Military-industrial complex (USA)", + "description": "Military contractors in the USA", + "words": [], + "domains": [ + "constellis.com", + "actiontarget.com", + "adt.com", + "advanced-armament.com", + "aecom.com", + "aerospace.org", + "avinc.com", + "amgeneral.com", + "api.org", + "argonst.com", + "rockwellcollins.com", + "artisllc.com", + "assettinc.com", + "astronautics.com", + "aurora.aero", + "axon.com", + "baesystems.com", + "ball.com", + "barrett.net", + "battelle.org", + "bechtel.com", + "boeing.com", + "boozallen.com", + "bostondynamics.com", + "caci.com", + "carlyle.com", + "cmu.edu", + "ceradyne.com", + "cloudera.com", + "columbiagroup.com", + "csra.com", + "cubic.com", + "omegatraining.com", + "curtisswright.com", + "decibelresearch.com", + "draper.com", + "leonardodrs.com", + "dyn-intl.com", + "ewi.org", + "elbitsystems.com", + "ensco.com", + "ey.com", + "evergreenaviation.com", + "exxonmobil.com", + "fluor.com", + "gdls.com", + "qinetiq-na.com", + "fwc.com", + "ga.com", + "gd.com", + "gdbiw.com", + "gdeb.com", + "gulfstream.com", + "ge.com", + "halliburton.com", + "healthnet.com", + "honeywell.com", + "humana.com", + "hii.com", + "hybricon.com", + "ibm.com", + "insighttechnology.com", + "intelsat.com", + "irobot.com", + "exelisinc.com", + "jacobs.com", + "jhu.edu", + "kaman.com", + "kbr.com", + "kearfott.com", + "knightarmco.com", + "kratosdefense.com", + "l3harris.com", + "rocket.com", + "leidos.com", + "eotechinc.com", + "lmtdefense.com", + "lockheedmartin.com", + "gyrocamsystems.com", + "sikorsky.com", + "genasys.com", + "mantech.com", + "maxar.com", + "mcqinc.com", + "microsoft.com", + "missionessential.com", + "motorola.com", + "natelems.com", + "navistar.com", + "nextel.com", + "northropgrumman.com", + "oceaneering.com", + "olin.com", + "oshkoshcorp.com", + "para-usa.com", + "perotsystems.com", + "army.mil", + "precast.com", + "rtx.com", + "collinsaerospace.com", + "rockwellcollins.com", + "goodrich.com", + "prattwhitney.com", + "raytheonintelligenceandspace.com", + "raytheonmissilesanddefense.com", + "remington.com", + "remarms.com", + "ruger.com", + "saab.com", + "saic.com", + "govcio.com", + "sncorp.com", + "smith-wesson.com", + "smithenterprise.com", + "sparta.com", + "nps.gov", + "srcinc.com", + "sri.com", + "stewartandstevenson.com", + "swiftengineering.com", + "tacticalairsupport.com", + "teledyne.com", + "textron.com", + "aaicorp.com", + "bellflight.com", + "trijicon.com", + "triwest.com", + "unisys.com", + "usord.com", + "verizon.com", + "vinnellarabia.com" + ] +}