diff --git a/content.py b/content.py index 042827fb1..c80dd4229 100644 --- a/content.py +++ b/content.py @@ -585,6 +585,73 @@ def _shorten_linked_urls(content: str) -> str: return content +def _contains_doi_reference(wrd: str, replace_dict: {}) -> bool: + """Handle DOI scientific references + """ + if not wrd.startswith('doi:') and \ + not wrd.startswith('DOI:'): + return False + + doi_ref_str = wrd.split(':', 1)[1] + doi_site = 'https://sci-hub.ru' + markup = '' + \ + 'doi:' + doi_ref_str + \ + '' + replace_dict[wrd] = markup + return True + + +def _contains_arxiv_reference(wrd: str, replace_dict: {}) -> bool: + """Handle arxiv scientific references + """ + if not wrd.startswith('arXiv:') and \ + not wrd.startswith('arx:') and \ + not wrd.startswith('arxiv:'): + return False + + arxiv_ref_str = wrd.split(':', 1)[1].lower() + if '.' in arxiv_ref_str: + arxiv_ref = arxiv_ref_str.split('.') + elif ':' in arxiv_ref_str: + arxiv_ref = arxiv_ref_str.split(':') + else: + return False + if len(arxiv_ref) != 2: + return False + if not arxiv_ref[0].isdigit(): + return False + arxiv_day = arxiv_ref[1] + if 'v' in arxiv_day: + arxiv_day = arxiv_day.split('v')[0] + if not arxiv_day.isdigit(): + return False + ref_str = arxiv_ref[0] + '.' 
+ arxiv_ref[1] + markup = '' + \ + 'arXiv:' + ref_str + \ + '' + replace_dict[wrd] = markup + return True + + +def _contains_academic_references(content: str) -> bool: + """Does the given content contain academic references + """ + prefixes = ( + 'arXiv:', 'arx:', 'arxiv:', + 'doi:', 'DOI:' + ) + for reference in prefixes: + if reference in content: + return True + return False + + def add_web_links(content: str) -> str: """Adds markup for web links """ @@ -604,7 +671,7 @@ def add_web_links(content: str) -> str: # if there are no prefixes then just keep the content we have if not prefix_found: - if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content: + if _contains_academic_references(content): prefix_found = True else: return content @@ -615,30 +682,9 @@ def add_web_links(content: str) -> str: for wrd in words: if ':' not in wrd: continue - # handle arxiv scientific references - if wrd.startswith('arXiv:') or \ - wrd.startswith('arx:') or \ - wrd.startswith('arxiv:'): - arxiv_ref_str = wrd.split(':', 1)[1].lower() - if '.' in arxiv_ref_str: - arxiv_ref = arxiv_ref_str.split('.') - elif ':' in arxiv_ref_str: - arxiv_ref = arxiv_ref_str.split(':') - else: - continue - if len(arxiv_ref) == 2: - arxiv_day = arxiv_ref[1] - if 'v' in arxiv_day: - arxiv_day = arxiv_day.split('v')[0] - if arxiv_ref[0].isdigit() and arxiv_day.isdigit(): - ref_str = arxiv_ref[0] + '.' + arxiv_ref[1] - markup = '' + \ - 'arXiv:' + ref_str + \ - '' - replace_dict[wrd] = markup + if _contains_arxiv_reference(wrd, replace_dict): + continue + if _contains_doi_reference(wrd, replace_dict): continue # does the word begin with a prefix? 
prefix_found = False diff --git a/tests.py b/tests.py index 2bacd9d6b..c99597fee 100644 --- a/tests.py +++ b/tests.py @@ -3623,6 +3623,11 @@ def _test_web_links(): print(linked_text) assert expected_text in linked_text + # NOTE: it is difficult to find academic studies of the fediverse which + # do not in some way violate consent or embody an arrogant status + # quo attitude. Did all those scraped accounts agree to be part of + # an academic study? Did they even consider consent as an issue? + # It seems doubtful. We are just like algae under a microscope to them. example_text = \ 'This post has an arxiv link arXiv:2203.15752 some other text' linked_text = add_web_links(example_text) @@ -3635,6 +3640,20 @@ def _test_web_links(): print(linked_text) assert expected_text in linked_text + example_text = \ + 'This post has an doi link ' + \ + 'doi:10.1109/INFCOMW.2019.8845221 some other text' + linked_text = add_web_links(example_text) + expected_text = \ + '' + \ + 'doi:10.1109/INFCOMW.2019.8845221' + if expected_text not in linked_text: + print(expected_text + '\n') + print(linked_text) + assert expected_text in linked_text + example_text = \ 'This post has a very long web link\n\nhttp://' + \ 'cbwebewuvfuftdiudbqd33dddbbyuef23fyug3bfhcyu2fct2' + \ diff --git a/webapp_podcast.py b/webapp_podcast.py index 4388b4d83..e54739dbc 100644 --- a/webapp_podcast.py +++ b/webapp_podcast.py @@ -299,7 +299,7 @@ def html_podcast_episode(translate: {}, text_mode_banner: str, access_keys: {}, session, session_onion, session_i2p, http_prefix: str, debug: bool) -> str: - """Returns html for a podcast episode, giebn an item from the newswire + """Returns html for a podcast episode, given an item from the newswire """ css_filename = base_dir + '/epicyon-podcast.css' if os.path.isfile(base_dir + '/podcast.css'):