| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | __filename__ = "categories.py" | 
					
						
							|  |  |  | __author__ = "Bob Mottram" | 
					
						
							|  |  |  | __license__ = "AGPL3+" | 
					
						
							| 
									
										
										
										
											2022-02-03 13:58:20 +00:00
										 |  |  | __version__ = "1.3.0" | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | __maintainer__ = "Bob Mottram" | 
					
						
							| 
									
										
										
										
											2021-09-10 16:14:50 +00:00
										 |  |  | __email__ = "bob@libreserver.org" | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | __status__ = "Production" | 
					
						
							| 
									
										
										
										
											2021-06-15 15:08:12 +00:00
										 |  |  | __module_group__ = "RSS Feeds" | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | import os | 
					
						
							|  |  |  | import datetime | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  | MAX_TAG_LENGTH = 42 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | INVALID_HASHTAG_CHARS = (',', ' ', '<', ';', '\\', '"', '&', '#') | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_hashtag_category(base_dir: str, hashtag: str) -> str: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Returns the category for the hashtag
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_filename = base_dir + '/tags/' + hashtag + '.category' | 
					
						
							|  |  |  |     if not os.path.isfile(category_filename): | 
					
						
							|  |  |  |         category_filename = base_dir + '/tags/' + hashtag.title() + '.category' | 
					
						
							|  |  |  |         if not os.path.isfile(category_filename): | 
					
						
							|  |  |  |             category_filename = \ | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |                 base_dir + '/tags/' + hashtag.upper() + '.category' | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             if not os.path.isfile(category_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 return '' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_str = None | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         with open(category_filename, 'r') as category_file: | 
					
						
							|  |  |  |             category_str = category_file.read() | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  |     except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         print('EX: unable to read category ' + category_filename) | 
					
						
							|  |  |  |     if category_str: | 
					
						
							|  |  |  |         return category_str | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     return '' | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def get_hashtag_categories(base_dir: str, | 
					
						
							|  |  |  |                            recent: bool = False, | 
					
						
							|  |  |  |                            category: str = None) -> None: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Returns a dictionary containing hashtag categories
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     hashtag_categories = {} | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     if recent: | 
					
						
							| 
									
										
										
										
											2021-12-26 13:17:46 +00:00
										 |  |  |         curr_time = datetime.datetime.utcnow() | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         days_since_epoch = (curr_time - datetime.datetime(1970, 1, 1)).days | 
					
						
							|  |  |  |         recently = days_since_epoch - 1 | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |     for subdir, dirs, files in os.walk(base_dir + '/tags'): | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         for catfile in files: | 
					
						
							|  |  |  |             if not catfile.endswith('.category'): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             category_filename = os.path.join(base_dir + '/tags', catfile) | 
					
						
							|  |  |  |             if not os.path.isfile(category_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             hashtag = catfile.split('.')[0] | 
					
						
							|  |  |  |             if len(hashtag) > MAX_TAG_LENGTH: | 
					
						
							| 
									
										
										
										
											2021-01-24 10:45:35 +00:00
										 |  |  |                 continue | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             with open(category_filename, 'r') as fp_category: | 
					
						
							|  |  |  |                 category_str = fp_category.read() | 
					
						
							| 
									
										
										
										
											2021-06-21 22:52:04 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                 if not category_str: | 
					
						
							| 
									
										
										
										
											2021-06-21 22:52:04 +00:00
										 |  |  |                     continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 if category: | 
					
						
							|  |  |  |                     # only return a dictionary for a specific category | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                     if category_str != category: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                         continue | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |                 if recent: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                     tags_filename = base_dir + '/tags/' + hashtag + '.txt' | 
					
						
							|  |  |  |                     if not os.path.isfile(tags_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                         continue | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                     mod_time_since_epoc = \ | 
					
						
							|  |  |  |                         os.path.getmtime(tags_filename) | 
					
						
							|  |  |  |                     last_modified_date = \ | 
					
						
							|  |  |  |                         datetime.datetime.fromtimestamp(mod_time_since_epoc) | 
					
						
							|  |  |  |                     file_days_since_epoch = \ | 
					
						
							|  |  |  |                         (last_modified_date - | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                          datetime.datetime(1970, 1, 1)).days | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                     if file_days_since_epoch < recently: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                         continue | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                 if not hashtag_categories.get(category_str): | 
					
						
							|  |  |  |                     hashtag_categories[category_str] = [hashtag] | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 else: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                     if hashtag not in hashtag_categories[category_str]: | 
					
						
							|  |  |  |                         hashtag_categories[category_str].append(hashtag) | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |         break | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     return hashtag_categories | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def update_hashtag_categories(base_dir: str) -> None: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Regenerates the list of hashtag categories
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_list_filename = base_dir + '/accounts/categoryList.txt' | 
					
						
							|  |  |  |     hashtag_categories = get_hashtag_categories(base_dir) | 
					
						
							|  |  |  |     if not hashtag_categories: | 
					
						
							|  |  |  |         if os.path.isfile(category_list_filename): | 
					
						
							| 
									
										
										
										
											2021-09-05 10:17:43 +00:00
										 |  |  |             try: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                 os.remove(category_list_filename) | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |             except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |                 print('EX: update_hashtag_categories ' + | 
					
						
							| 
									
										
										
										
											2021-10-29 16:31:20 +00:00
										 |  |  |                       'unable to delete cached category list ' + | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                       category_list_filename) | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |         return | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_list = [] | 
					
						
							|  |  |  |     for category_str, _ in hashtag_categories.items(): | 
					
						
							|  |  |  |         category_list.append(category_str) | 
					
						
							|  |  |  |     category_list.sort() | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_list_str = '' | 
					
						
							|  |  |  |     for category_str in category_list: | 
					
						
							|  |  |  |         category_list_str += category_str + '\n' | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  |     # save a list of available categories for quick lookup | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         with open(category_list_filename, 'w+') as fp_category: | 
					
						
							|  |  |  |             fp_category.write(category_list_str) | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |     except OSError: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         print('EX: unable to write category ' + category_list_filename) | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def _valid_hashtag_category(category: str) -> bool: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Returns true if the category name is valid
 | 
					
						
							|  |  |  |     """
 | 
					
						
							|  |  |  |     if not category: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     for char in INVALID_HASHTAG_CHARS: | 
					
						
							|  |  |  |         if char in category: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |             return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # too long | 
					
						
							|  |  |  |     if len(category) > 40: | 
					
						
							|  |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     return True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  | def set_hashtag_category(base_dir: str, hashtag: str, category: str, | 
					
						
							|  |  |  |                          update: bool, force: bool = False) -> bool: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Sets the category for the hashtag
 | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |     if not _valid_hashtag_category(category): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |         return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if not force: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         hashtag_filename = base_dir + '/tags/' + hashtag + '.txt' | 
					
						
							|  |  |  |         if not os.path.isfile(hashtag_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |             hashtag = hashtag.title() | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             hashtag_filename = base_dir + '/tags/' + hashtag + '.txt' | 
					
						
							|  |  |  |             if not os.path.isfile(hashtag_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 hashtag = hashtag.upper() | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                 hashtag_filename = base_dir + '/tags/' + hashtag + '.txt' | 
					
						
							|  |  |  |                 if not os.path.isfile(hashtag_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                     return False | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-25 16:17:53 +00:00
										 |  |  |     if not os.path.isdir(base_dir + '/tags'): | 
					
						
							|  |  |  |         os.mkdir(base_dir + '/tags') | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_filename = base_dir + '/tags/' + hashtag + '.category' | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     if force: | 
					
						
							|  |  |  |         # don't overwrite any existing categories | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         if os.path.isfile(category_filename): | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |             return False | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_written = False | 
					
						
							| 
									
										
										
										
											2021-11-25 18:42:38 +00:00
										 |  |  |     try: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         with open(category_filename, 'w+') as fp_category: | 
					
						
							|  |  |  |             fp_category.write(category) | 
					
						
							|  |  |  |             category_written = True | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |     except OSError as ex: | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |         print('EX: unable to write category ' + category_filename + | 
					
						
							| 
									
										
										
										
											2021-12-25 15:28:52 +00:00
										 |  |  |               ' ' + str(ex)) | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     if category_written: | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  |         if update: | 
					
						
							| 
									
										
										
										
											2021-12-29 21:55:09 +00:00
										 |  |  |             update_hashtag_categories(base_dir) | 
					
						
							| 
									
										
										
										
											2021-11-26 12:28:20 +00:00
										 |  |  |         return True | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     return False | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  | def guess_hashtag_category(tagName: str, hashtag_categories: {}) -> str: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |     """Tries to guess a category for the given hashtag.
 | 
					
						
							|  |  |  |     This works by trying to find the longest similar hashtag | 
					
						
							|  |  |  |     """
 | 
					
						
							| 
									
										
										
										
											2021-07-13 08:43:07 +00:00
										 |  |  |     if len(tagName) < 4: | 
					
						
							|  |  |  |         return '' | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     category_matched = '' | 
					
						
							|  |  |  |     tag_matched_len = 0 | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     for category_str, hashtag_list in hashtag_categories.items(): | 
					
						
							|  |  |  |         for hashtag in hashtag_list: | 
					
						
							| 
									
										
										
										
											2021-07-13 08:35:29 +00:00
										 |  |  |             if len(hashtag) < 4: | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |                 # avoid matching very small strings which often | 
					
						
							|  |  |  |                 # lead to spurious categories | 
					
						
							|  |  |  |                 continue | 
					
						
							|  |  |  |             if hashtag not in tagName: | 
					
						
							|  |  |  |                 if tagName not in hashtag: | 
					
						
							|  |  |  |                     continue | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |             if not category_matched: | 
					
						
							|  |  |  |                 tag_matched_len = len(hashtag) | 
					
						
							|  |  |  |                 category_matched = category_str | 
					
						
							| 
									
										
										
										
											2020-12-22 10:30:52 +00:00
										 |  |  |             else: | 
					
						
							|  |  |  |                 # match the longest tag | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |                 if len(hashtag) > tag_matched_len: | 
					
						
							|  |  |  |                     category_matched = category_str | 
					
						
							|  |  |  |     if not category_matched: | 
					
						
							| 
									
										
										
										
											2021-07-13 08:43:07 +00:00
										 |  |  |         return '' | 
					
						
							| 
									
										
										
										
											2021-12-30 18:38:36 +00:00
										 |  |  |     return category_matched |