def remove_zero_length_strings(text: str) -> str:
    """Return text with every zero-width space (U+200B) removed.

    Despite the function name, nothing of "zero length" is searched for:
    the only thing stripped is the invisible ZERO WIDTH SPACE character.
    Other zero-width code points (for example the joiner U+200D) are left
    untouched.

    The newswire feed reader passes downloaded feed text through this
    function before its size and invalid-character checks.

    text: the string to clean
    Returns a new string; the argument itself is not modified.
    """
    # The character is written as an escape on purpose. A raw U+200B
    # between the quotes is invisible, looks exactly like '' and is
    # easily dropped by editors or copy/paste, which would silently turn
    # this call into a no-op.
    return text.replace('\u200b', '')