mirror of https://gitlab.com/bashrc2/epicyon
Tidying
parent
eaf57a9781
commit
3b55ef745e
111
crawlers.py
111
crawlers.py
|
@ -229,80 +229,41 @@ def blocked_user_agent(calling_domain: str, agent_str: str,
|
|||
if blocked_ua:
|
||||
print('BLOCK: Blocked User agent 2: ' + agent_domain)
|
||||
|
||||
# optionally block military domains on a per account basis
|
||||
if not blocked_ua and block_military:
|
||||
if '/users/' in path:
|
||||
# which account is this?
|
||||
nickname = path.split('/users/')[1]
|
||||
if '/' in nickname:
|
||||
nickname = nickname.split('/')[0]
|
||||
# does this account block military domains?
|
||||
if block_military.get(nickname):
|
||||
mil_domains = get_mil_domains_list()
|
||||
for domain_str in mil_domains:
|
||||
if '.' not in domain_str:
|
||||
tld = domain_str
|
||||
if agent_domain.endswith('.' + tld):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked military tld user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
else:
|
||||
if agent_domain.endswith(domain_str):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked military user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
|
||||
# optionally block government domains on a per account basis
|
||||
if not blocked_ua and block_government:
|
||||
if '/users/' in path:
|
||||
# which account is this?
|
||||
nickname = path.split('/users/')[1]
|
||||
if '/' in nickname:
|
||||
nickname = nickname.split('/')[0]
|
||||
# does this account block government domains?
|
||||
if block_government.get(nickname):
|
||||
gov_domains = get_gov_domains_list()
|
||||
for domain_str in gov_domains:
|
||||
if '.' not in domain_str:
|
||||
tld = domain_str
|
||||
if agent_domain.endswith('.' + tld):
|
||||
blocked_ua = True
|
||||
print('BLOCK: ' +
|
||||
'Blocked government tld user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
else:
|
||||
if agent_domain.endswith(domain_str):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked government user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
|
||||
# optionally block bluesky bridges on a per account basis
|
||||
if not blocked_ua and block_bluesky:
|
||||
if '/users/' in path:
|
||||
# which account is this?
|
||||
nickname = path.split('/users/')[1]
|
||||
if '/' in nickname:
|
||||
nickname = nickname.split('/')[0]
|
||||
# does this account block bluesky bridges?
|
||||
if block_bluesky.get(nickname):
|
||||
bsky_domains = get_bsky_domains_list()
|
||||
for domain_str in bsky_domains:
|
||||
if '.' not in domain_str:
|
||||
tld = domain_str
|
||||
if agent_domain.endswith('.' + tld):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked bluesky tld user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
else:
|
||||
if agent_domain.endswith(domain_str):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked bluesky user agent: ' +
|
||||
agent_domain)
|
||||
break
|
||||
block_dicts = {
|
||||
"military": block_military,
|
||||
"government": block_government,
|
||||
"bluesky": block_bluesky
|
||||
}
|
||||
for block_type, block_dict in block_dicts.items():
|
||||
if blocked_ua or not block_dict:
|
||||
continue
|
||||
if '/users/' not in path:
|
||||
continue
|
||||
# which account is this?
|
||||
nickname = path.split('/users/')[1]
|
||||
if '/' in nickname:
|
||||
nickname = nickname.split('/')[0]
|
||||
# does this account block this type of domain?
|
||||
if not block_dict.get(nickname):
|
||||
continue
|
||||
if block_type == "military":
|
||||
blk_domains = get_mil_domains_list()
|
||||
elif block_type == "government":
|
||||
blk_domains = get_gov_domains_list()
|
||||
else:
|
||||
blk_domains = get_bsky_domains_list()
|
||||
for domain_str in blk_domains:
|
||||
if '.' not in domain_str:
|
||||
tld = domain_str
|
||||
if agent_domain.endswith('.' + tld):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked ' + block_type +
|
||||
' tld user agent: ' + agent_domain)
|
||||
break
|
||||
elif agent_domain.endswith(domain_str):
|
||||
blocked_ua = True
|
||||
print('BLOCK: Blocked ' + block_type +
|
||||
' user agent: ' + agent_domain)
|
||||
break
|
||||
|
||||
return blocked_ua, blocked_cache_last_updated, False
|
||||
|
|
Loading…
Reference in New Issue