From f25cf2121af72e681f9e8c1ae1fd6b6fa6fb5b03 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Wed, 4 Jan 2023 13:33:05 +0000
Subject: [PATCH] Link to DOI scientific references

---
 content.py        | 36 +++++++++++++++++++++++++++++++++++-
 tests.py          | 19 +++++++++++++++++++
 webapp_podcast.py |  2 +-
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/content.py b/content.py
index 31d86978e..c80dd4229 100644
--- a/content.py
+++ b/content.py
@@ -585,6 +585,25 @@ def _shorten_linked_urls(content: str) -> str:
     return content
 
 
+def _contains_doi_reference(wrd: str, replace_dict: {}) -> bool:
+    """Handle DOI scientific references
+    """
+    if not wrd.startswith('doi:') and \
+       not wrd.startswith('DOI:'):
+        return False
+
+    doi_ref_str = wrd.split(':', 1)[1]
+    doi_site = 'https://sci-hub.ru'
+    markup = '<a href="' + doi_site + '/' + \
+        doi_ref_str + '" ' + \
+        'rel="nofollow noopener noreferrer" ' + \
+        'target="_blank"><span class="ellipsis">' + \
+        'doi:' + doi_ref_str + \
+        '</span></a>'
+    replace_dict[wrd] = markup
+    return True
+
+
 def _contains_arxiv_reference(wrd: str, replace_dict: {}) -> bool:
     """Handle arxiv scientific references
     """
@@ -620,6 +639,19 @@ def _contains_arxiv_reference(wrd: str, replace_dict: {}) -> bool:
     return True
 
 
+def _contains_academic_references(content: str) -> bool:
+    """Does the given content contain academic references
+    """
+    prefixes = (
+        'arXiv:', 'arx:', 'arxiv:',
+        'doi:', 'DOI:'
+    )
+    for reference in prefixes:
+        if reference in content:
+            return True
+    return False
+
+
 def add_web_links(content: str) -> str:
     """Adds markup for web links
     """
@@ -639,7 +671,7 @@ def add_web_links(content: str) -> str:
 
     # if there are no prefixes then just keep the content we have
     if not prefix_found:
-        if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content:
+        if _contains_academic_references(content):
             prefix_found = True
         else:
             return content
@@ -652,6 +684,8 @@ def add_web_links(content: str) -> str:
             continue
         if _contains_arxiv_reference(wrd, replace_dict):
             continue
+        if _contains_doi_reference(wrd, replace_dict):
+            continue
         # does the word begin with a prefix?
         prefix_found = False
         for prefix in prefixes:
diff --git a/tests.py b/tests.py
index 2bacd9d6b..c99597fee 100644
--- a/tests.py
+++ b/tests.py
@@ -3623,6 +3623,11 @@ def _test_web_links():
     print(linked_text)
     assert expected_text in linked_text
 
+    # NOTE: it is difficult to find academic studies of the fediverse which
+    # do not in some way violate consent or embody an arrogant status
+    # quo attitude. Did all those scraped accounts agree to be part of
+    # an academic study? Did they even consider consent as an issue?
+    # It seems doubtful. We are just like algae under a microscope to them.
     example_text = \
         'This post has an arxiv link arXiv:2203.15752 some other text'
     linked_text = add_web_links(example_text)
@@ -3635,6 +3640,20 @@ def _test_web_links():
     print(linked_text)
     assert expected_text in linked_text
 
+    example_text = \
+        'This post has an doi link ' + \
+        'doi:10.1109/INFCOMW.2019.8845221 some other text'
+    linked_text = add_web_links(example_text)
+    expected_text = \
+        '<a href="https://sci-hub.ru/10.1109/INFCOMW.2019.8845221" ' + \
+        'rel="nofollow noopener noreferrer" ' + \
+        'target="_blank"><span class="ellipsis">' + \
+        'doi:10.1109/INFCOMW.2019.8845221</span></a>'
+    if expected_text not in linked_text:
+        print(expected_text + '\n')
+        print(linked_text)
+    assert expected_text in linked_text
+
     example_text = \
         'This post has a very long web link\n\nhttp://' + \
         'cbwebewuvfuftdiudbqd33dddbbyuef23fyug3bfhcyu2fct2' + \
diff --git a/webapp_podcast.py b/webapp_podcast.py
index 4388b4d83..e54739dbc 100644
--- a/webapp_podcast.py
+++ b/webapp_podcast.py
@@ -299,7 +299,7 @@ def html_podcast_episode(translate: {}, text_mode_banner: str,
                          access_keys: {},
                          session, session_onion, session_i2p,
                          http_prefix: str, debug: bool) -> str:
-    """Returns html for a podcast episode, giebn an item from the newswire
+    """Returns html for a podcast episode, an item from the newswire
     """
     css_filename = base_dir + '/epicyon-podcast.css'
     if os.path.isfile(base_dir + '/podcast.css'):