Remove control characters

main
Bob Mottram 2020-10-10 09:54:13 +01:00
parent acc76cdcd6
commit 81cc189755
1 changed files with 6 additions and 3 deletions

View File

@@ -9,7 +9,7 @@ __status__ = "Production"
import os import os
import time import time
import datetime import datetime
import urllib.parse import unicodedata
from collections import OrderedDict from collections import OrderedDict
from newswire import getDictFromNewswire from newswire import getDictFromNewswire
from posts import createNewsPost from posts import createNewsPost
@@ -51,6 +51,9 @@ def saveArrivedTime(baseDir: str, postFilename: str, arrived: str) -> None:
arrivedFile.close() arrivedFile.close()
def removeControlCharacters(content: str):
    """Return content with every Unicode "C"-category character removed.

    unicodedata.category() yields a two-letter code whose first letter is
    the major class. Class "C" covers control (Cc), format (Cf),
    surrogate (Cs), private-use (Co) and unassigned (Cn) code points, so
    tabs, newlines and zero-width characters are dropped as well.
    """
    keptChars = []
    for character in content:
        majorClass = unicodedata.category(character)[0]
        if majorClass == "C":
            # Skip anything in the "Other" major class
            continue
        keptChars.append(character)
    return "".join(keptChars)
def convertRSStoActivityPub(baseDir: str, httpPrefix: str, def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
domain: str, port: int, domain: str, port: int,
newswire: {}, newswire: {},
@@ -90,8 +93,8 @@ def convertRSStoActivityPub(baseDir: str, httpPrefix: str,
newswire[originalDateStr][3] = filename newswire[originalDateStr][3] = filename
continue continue
rssTitle = urllib.parse.unquote_plus(item[0]) rssTitle = removeControlCharacters(item[0])
url = urllib.parse.unquote_plus(item[1]) url = removeControlCharacters(item[1])
rssDescription = '' rssDescription = ''
# get the rss description if it exists # get the rss description if it exists