From 9a7b95eca4208bd53281cdff08dcda827c9c7594 Mon Sep 17 00:00:00 2001
From: Bob Mottram <bob@libreserver.org>
Date: Wed, 4 Jan 2023 11:53:15 +0000
Subject: [PATCH] Support arxiv scientific publication references

---
 content.py | 30 +++++++++++++++++++++++++++++-
 tests.py   | 14 +++++++++++++-
 2 files changed, 42 insertions(+), 2 deletions(-)
diff --git a/content.py b/content.py
index a1320d4b7..042827fb1 100644
--- a/content.py
+++ b/content.py
@@ -604,7 +604,10 @@ def add_web_links(content: str) -> str:
 
     # if there are no prefixes then just keep the content we have
     if not prefix_found:
-        return content
+        if 'arXiv:' in content or 'arx:' in content or 'arxiv:' in content:
+            prefix_found = True
+        else:
+            return content
 
     content = content.replace('\r', '')
     words = content.replace('\n', ' --linebreak-- ').split(' ')
@@ -612,6 +615,31 @@ def add_web_links(content: str) -> str:
     for wrd in words:
         if ':' not in wrd:
             continue
+        # handle arxiv scientific references
+        if wrd.startswith('arXiv:') or \
+           wrd.startswith('arx:') or \
+           wrd.startswith('arxiv:'):
+            arxiv_ref_str = wrd.split(':', 1)[1].lower()
+            if '.' in arxiv_ref_str:
+                arxiv_ref = arxiv_ref_str.split('.')
+            elif ':' in arxiv_ref_str:
+                arxiv_ref = arxiv_ref_str.split(':')
+            else:
+                continue
+            if len(arxiv_ref) == 2:
+                arxiv_day = arxiv_ref[1]
+                if 'v' in arxiv_day:
+                    arxiv_day = arxiv_day.split('v')[0]
+                if arxiv_ref[0].isdigit() and arxiv_day.isdigit():
+                    ref_str = arxiv_ref[0] + '.' + arxiv_ref[1]
+                    markup = '<a href="https://arxiv.org/abs/' + \
+                        ref_str + '" tabindex="10" ' + \
+                        'rel="nofollow noopener noreferrer" ' + \
+                        'target="_blank">' + \
+                        '<span class="ellipsis">arXiv:' + ref_str + \
+                        '</span></a>'
+                    replace_dict[wrd] = markup
+            continue
         # does the word begin with a prefix?
         prefix_found = False
         for prefix in prefixes:
diff --git a/tests.py b/tests.py
index da05e6d37..a15bf87f2 100644
--- a/tests.py
+++ b/tests.py
@@ -3617,7 +3617,19 @@ def _test_web_links():
         '<a href="https://somesite.net" tabindex="10" ' + \
         'rel="nofollow noopener noreferrer"' + \
         ' target="_blank"><span class="invisible">https://' + \
-        '</span><span class="ellipsis">somesite.net</span></a'
+        '</span><span class="ellipsis">somesite.net</span></a>'
+    if expected_text not in linked_text:
+        print(expected_text + '\n')
+        print(linked_text)
+    assert expected_text in linked_text
+
+    example_text = \
+        'This post has an arxiv link arXiv:2212.14672 some other text'
+    linked_text = add_web_links(example_text)
+    expected_text = \
+        '<a href="https://arxiv.org/abs/2212.14672" tabindex="10" ' + \
+        'rel="nofollow noopener noreferrer"' + \
+        ' target="_blank"><span class="ellipsis">arXiv:2212.14672</span></a>'
     if expected_text not in linked_text:
         print(expected_text + '\n')
         print(linked_text)