| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  | __filename__ = "cwlists.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2024-01-21 19:01:20 +00:00
										 |  |  | __version__ = "1.5.0" | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							|  |  |  | __email__ = "bob@libreserver.org" | 
					
						
							|  |  |  | __status__ = "Production" | 
					
						
							|  |  |  | __module_group__ = "Core" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | from utils import load_json | 
					
						
							|  |  |  | from utils import get_content_from_post | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def load_cw_lists(base_dir: str, verbose: bool) -> {}: | 
					
						
							|  |  |  |     """Load lists used for content warnings
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not os.path.isdir(base_dir + '/cwlists'): | 
					
						
							|  |  |  |         return {} | 
					
						
							|  |  |  |     result = {} | 
					
						
							|  |  |  |     # NOTE: here we do want to allow recursive walk through | 
					
						
							|  |  |  |     # possible subdirectories | 
					
						
							|  |  |  |     for _, _, files in os.walk(base_dir + '/cwlists'): | 
					
						
							|  |  |  |         for fname in files: | 
					
						
							|  |  |  |             if not fname.endswith('.json'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             list_filename = os.path.join(base_dir + '/cwlists', fname) | 
					
						
							|  |  |  |             print('list_filename: ' + list_filename) | 
					
						
							| 
									
										
										
										
											2024-06-20 10:47:58 +00:00
										 |  |  |             list_json = load_json(list_filename) | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  |             if not list_json: | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if not list_json.get('name'): | 
					
						
							|  |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2024-02-23 10:32:46 +00:00
										 |  |  |             if not list_json.get('words') and \ | 
					
						
							|  |  |  |                not list_json.get('hashtags') and \ | 
					
						
							|  |  |  |                not list_json.get('domains'): | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  |                 continue | 
					
						
							|  |  |  |             name = list_json['name'] | 
					
						
							|  |  |  |             if verbose: | 
					
						
							|  |  |  |                 print('List: ' + name) | 
					
						
							|  |  |  |             result[name] = list_json | 
					
						
							|  |  |  |     return result | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-08 10:22:38 +00:00
										 |  |  | def _add_cw_match_tags(item: {}, post_tags: {}, cw_text: str, | 
					
						
							|  |  |  |                        warning: str) -> (bool, str): | 
					
						
							|  |  |  |     """Updates content warning text using hashtags from within
 | 
					
						
							|  |  |  |     the post content | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     matched = False | 
					
						
							|  |  |  |     for tag in item['hashtags']: | 
					
						
							|  |  |  |         tag = tag.strip() | 
					
						
							|  |  |  |         if not tag: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not tag.startswith('#'): | 
					
						
							|  |  |  |             tag = '#' + tag | 
					
						
							|  |  |  |         tag = tag.lower() | 
					
						
							|  |  |  |         for tag_dict in post_tags: | 
					
						
							|  |  |  |             if not isinstance(tag_dict, dict): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if not tag_dict.get('Hashtag'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if not tag_dict.get('name'): | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if tag_dict['name'].lower() == tag: | 
					
						
							|  |  |  |                 if cw_text: | 
					
						
							|  |  |  |                     cw_text = warning + ' / ' + cw_text | 
					
						
							|  |  |  |                 else: | 
					
						
							|  |  |  |                     cw_text = warning | 
					
						
							|  |  |  |                 matched = True | 
					
						
							|  |  |  |                 break | 
					
						
							|  |  |  |         if matched: | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |     return matched, cw_text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _add_cw_match_domains(item: {}, content: str, cw_text: str, | 
					
						
							|  |  |  |                           warning: str) -> (bool, str): | 
					
						
							|  |  |  |     """Updates content warning text using domains from within
 | 
					
						
							|  |  |  |     the post content | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     matched = False | 
					
						
							|  |  |  |     for domain in item['domains']: | 
					
						
							|  |  |  |         if '.' in domain: | 
					
						
							|  |  |  |             first_section = domain.split('.')[0] | 
					
						
							|  |  |  |             if len(first_section) < 4: | 
					
						
							|  |  |  |                 if '.' + domain in content or \ | 
					
						
							|  |  |  |                    '/' + domain in content: | 
					
						
							|  |  |  |                     if cw_text: | 
					
						
							|  |  |  |                         cw_text = warning + ' / ' + cw_text | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         cw_text = warning | 
					
						
							|  |  |  |                     matched = True | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         if domain in content: | 
					
						
							|  |  |  |             if cw_text: | 
					
						
							|  |  |  |                 cw_text = warning + ' / ' + cw_text | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 cw_text = warning | 
					
						
							|  |  |  |             matched = True | 
					
						
							|  |  |  |             break | 
					
						
							|  |  |  |     return matched, cw_text | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  | def add_cw_from_lists(post_json_object: {}, cw_lists: {}, translate: {}, | 
					
						
							|  |  |  |                       lists_enabled: str, system_language: str, | 
					
						
							|  |  |  |                       languages_understood: []) -> None: | 
					
						
							|  |  |  |     """Adds content warnings by matching the post content
 | 
					
						
							|  |  |  |     against domains or keywords | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not lists_enabled: | 
					
						
							|  |  |  |         return | 
					
						
							|  |  |  |     if 'content' not in post_json_object['object']: | 
					
						
							|  |  |  |         if 'contentMap' not in post_json_object['object']: | 
					
						
							|  |  |  |             return | 
					
						
							|  |  |  |     cw_text = '' | 
					
						
							|  |  |  |     if post_json_object['object'].get('summary'): | 
					
						
							|  |  |  |         cw_text = post_json_object['object']['summary'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     content = get_content_from_post(post_json_object, system_language, | 
					
						
							|  |  |  |                                     languages_understood, "content") | 
					
						
							|  |  |  |     if not content: | 
					
						
							|  |  |  |         return | 
					
						
							| 
									
										
										
										
											2024-02-23 10:32:46 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     post_tags = [] | 
					
						
							|  |  |  |     if post_json_object['object'].get('tag'): | 
					
						
							|  |  |  |         if isinstance(post_json_object['object']['tag'], list): | 
					
						
							|  |  |  |             post_tags = post_json_object['object']['tag'] | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  |     for name, item in cw_lists.items(): | 
					
						
							|  |  |  |         if name not in lists_enabled: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         if not item.get('warning'): | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  |         warning = item['warning'] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # is there a translated version of the warning? | 
					
						
							|  |  |  |         if translate.get(warning): | 
					
						
							|  |  |  |             warning = translate[warning] | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # is the warning already in the CW? | 
					
						
							|  |  |  |         if warning in cw_text: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         matched = False | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-02-23 10:32:46 +00:00
										 |  |  |         # match hashtags within the post | 
					
						
							|  |  |  |         if post_tags and item.get('hashtags'): | 
					
						
							| 
									
										
										
										
											2024-05-08 10:22:38 +00:00
										 |  |  |             matched, cw_text = \ | 
					
						
							|  |  |  |                 _add_cw_match_tags(item, post_tags, cw_text, warning) | 
					
						
							| 
									
										
										
										
											2024-02-23 10:32:46 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if matched: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  |         # match domains within the content | 
					
						
							|  |  |  |         if item.get('domains'): | 
					
						
							| 
									
										
										
										
											2024-05-08 10:22:38 +00:00
										 |  |  |             matched, cw_text = \ | 
					
						
							|  |  |  |                 _add_cw_match_domains(item, content, cw_text, warning) | 
					
						
							| 
									
										
										
										
											2023-03-20 14:50:19 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |         if matched: | 
					
						
							|  |  |  |             continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         # match words within the content | 
					
						
							|  |  |  |         if item.get('words'): | 
					
						
							|  |  |  |             for word_str in item['words']: | 
					
						
							|  |  |  |                 if word_str in content or word_str.title() in content: | 
					
						
							|  |  |  |                     if cw_text: | 
					
						
							|  |  |  |                         cw_text = warning + ' / ' + cw_text | 
					
						
							|  |  |  |                     else: | 
					
						
							|  |  |  |                         cw_text = warning | 
					
						
							|  |  |  |                     break | 
					
						
							|  |  |  |     if cw_text: | 
					
						
							|  |  |  |         post_json_object['object']['summary'] = cw_text | 
					
						
							|  |  |  |         post_json_object['object']['sensitive'] = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def get_cw_list_variable(list_name: str) -> str: | 
					
						
							|  |  |  |     """Returns the variable associated with a CW list
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     return 'list' + list_name.replace(' ', '').replace("'", '') |