Export blogs in gemini format

2025-11-18 17:55:40 +00:00 · 2025-11-18 17:55:40 +00:00 · 8e7dc3b23e
parent a577d59341
commit 8e7dc3b23e
3 changed files with 176 additions and 5 deletions
--- a/gemini.py
+++ b/gemini.py
@ -0,0 +1,132 @@
+__filename__ = "gemini.py"
+__author__ = "Bob Mottram"
+__license__ = "AGPL3+"
+__version__ = "1.6.0"
+__maintainer__ = "Bob Mottram"
+__email__ = "bob@libreserver.org"
+__status__ = "Production"
+__module_group__ = "Timeline"
+
+import os
+import shutil
+from utils import acct_dir
+from utils import has_object_dict
+from utils import remove_html
+from utils import get_summary_from_post
+from utils import get_base_content_from_post
+
+
+def _gemini_safe_filename(text: str) -> str:
+    """
+    Returns the given text with path separators, dots, whitespace and
+    non-printable characters replaced by underscores, so that it can
+    be used as a single filename component
+    """
+    return ''.join('_' if ch in '/\\.' or ch.isspace() or
+                   not ch.isprintable() else ch
+                   for ch in text)
+
+
+def _gemini_extract_links(content_text: str) -> list[str]:
+    """
+    Returns the http, https and gemini links found within the given
+    plain text, in the order in which they appear
+    """
+    links: list[str] = []
+    sections = content_text.split('://')
+    # each '://' sits between the end of one section (the scheme) and
+    # the start of the next (the address). Pairing the sections means
+    # that a rejected link can never leave a stale scheme behind
+    for prev_section, section in zip(sections, sections[1:]):
+        # the scheme must directly precede the '://'
+        if not prev_section or prev_section[-1].isspace():
+            continue
+        # split on any whitespace, so that links at the beginning of
+        # a line are also detected
+        scheme = prev_section.split()[-1]
+        if scheme not in ('http', 'https', 'gemini'):
+            continue
+        # the address must directly follow the '://'
+        if not section or section[0].isspace():
+            continue
+        # the address ends at the next whitespace. Trailing sentence
+        # punctuation is not part of the link
+        link_str = section.split()[0].rstrip('.,;:!?')
+        if '.' not in link_str:
+            continue
+        links.append(scheme + '://' + link_str)
+    return links
+
+
+def blog_to_gemini(base_dir: str, nickname: str, domain: str,
+                   message_json: dict, system_language: str,
+                   debug: bool, testing: bool) -> bool:
+    """
+    Converts a blog post to gemini format
+    The post is saved as <date>_<title>.gmi within the gemini
+    subdirectory of the account directory. When testing is True
+    it is instead saved within base_dir/geminitest, and any existing
+    geminitest directory is removed first.
+    The file contains a heading made from the post summary, the
+    publication date, the content with html removed, and a '=> '
+    line for each web link found within the content.
+    Returns True on success
+    Returns False if the account directory does not exist, if the
+    post has no usable publication date or content, or if the
+    directory or file could not be written
+    """
+    if not testing:
+        account_dir = acct_dir(base_dir, nickname, domain)
+    else:
+        account_dir = base_dir
+        if os.path.isdir(account_dir + '/geminitest'):
+            shutil.rmtree(account_dir + '/geminitest', ignore_errors=True)
+
+    if not os.path.isdir(account_dir):
+        if debug:
+            print('WARN: blog_to_gemini account directory not found ' +
+                  account_dir)
+        return False
+
+    # get the publication date
+    obj = message_json
+    if has_object_dict(message_json):
+        obj = message_json['object']
+    if not obj.get('published'):
+        if debug:
+            print('WARN: blog_to_gemini Blog post has no publication date ' +
+                  str(message_json))
+        return False
+    if not isinstance(obj['published'], str):
+        if debug:
+            print('WARN: blog_to_gemini publication date is not a string ' +
+                  str(message_json))
+        return False
+    if 'T' not in obj['published']:
+        if debug:
+            print('WARN: blog_to_gemini ' +
+                  'publication date not in expected format ' +
+                  obj['published'])
+        return False
+    # only the date part, before the 'T', is used
+    published = obj['published'].split('T')[0]
+
+    # get the blog content
+    content_str = get_base_content_from_post(message_json, system_language)
+    if not content_str:
+        if debug:
+            print('WARN: blog_to_gemini no content ' +
+                  str(message_json))
+        return False
+    content_text = remove_html(content_str)
+
+    # get the blog title
+    title_text = ''
+    title_str = get_summary_from_post(message_json, system_language, [])
+    if title_str:
+        title_text = remove_html(title_str)
+
+    # add web links to the end of the content
+    links = _gemini_extract_links(content_text)
+    if links:
+        content_text += '\n\n'
+        content_text += ''.join('=> ' + link_str + '\n'
+                                for link_str in links)
+
+    # create gemini blog directory
+    if not testing:
+        gemini_blog_dir = account_dir + '/gemini'
+    else:
+        gemini_blog_dir = account_dir + '/geminitest'
+    if not os.path.isdir(gemini_blog_dir):
+        try:
+            os.mkdir(gemini_blog_dir)
+        except OSError:
+            print('EX: blog_to_gemini unable to create ' + gemini_blog_dir)
+            return False
+
+    # the date and title come from the post, so ensure that they
+    # cannot alter the path which is written to
+    gemini_blog_filename = \
+        gemini_blog_dir + '/' + _gemini_safe_filename(published) + '_' + \
+        _gemini_safe_filename(title_text).lower() + '.gmi'
+
+    if not title_text.startswith('# '):
+        title_text = '# ' + title_text
+
+    try:
+        with open(gemini_blog_filename, 'w',
+                  encoding='utf-8') as fp_gemini:
+            fp_gemini.write(title_text + '\n\n' + published + '\n\n' +
+                            content_text)
+    except OSError:
+        print('EX: blog_to_gemini unable to write ' + gemini_blog_filename)
+        return False
+
+    return True
--- a/outbox.py
+++ b/outbox.py
@ -68,6 +68,7 @@ from speaker import update_speaker
 from reading import store_book_events
 from reading import has_edition_tag
 from inbox_receive import inbox_update_index
+from gemini import blog_to_gemini


 def _localonly_not_local(message_json: {}, domain_full: str) -> bool:
@ -544,6 +545,11 @@ def post_message_to_outbox(session, translate: {},
        print('WARN: post not saved to outbox ' + outbox_name)
        return False

+    if outbox_name == 'tlblogs':
+        # export blog post in gemini format
+        blog_to_gemini(base_dir, post_to_nickname, domain,
+                       message_json, system_language, debug, False)
+
    # update the speaker endpoint for speech synthesis
    actor_url = get_actor_from_post(message_json)
    update_speaker(base_dir, http_prefix,
--- a/tests.py
+++ b/tests.py
@ -8,17 +8,17 @@ __status__ = "Production"
 __module_group__ = "Testing"

 import base64
+import time
+import os
+import shutil
+import json
+import datetime
 from cryptography.hazmat.primitives import hashes
 from cryptography.hazmat.backends import default_backend
 from cryptography.hazmat.primitives.serialization import load_pem_private_key
 from cryptography.hazmat.primitives.serialization import load_pem_public_key
 from cryptography.hazmat.primitives.asymmetric import padding
 from cryptography.hazmat.primitives.asymmetric import utils as hazutils
-import time
-import os
-import shutil
-import json
-import datetime
 from shutil import copyfile
 from random import randint
 from time import gmtime, strftime
@ -234,6 +234,7 @@ from webapp_utils import add_emoji_to_display_name
 from blocking import is_blocked_nickname
 from blocking import is_blocked_domain
 from filters import filtered_match
+from gemini import blog_to_gemini


 TEST_SERVER_GROUP_RUNNING = False
@ -9490,6 +9491,37 @@ def _test_actor_status() -> None:
    assert not actor_status_expired(actor['sm:status'])


+def _test_gemini_blog(base_dir: str) -> None:
+    """Exports a minimal blog post with blog_to_gemini in testing mode
+    and checks the directory, the file name and the file contents
+    """
+    print('gemini_blog')
+    post_title = 'Test title'
+    post_text = 'This is a test'
+    post_link = 'https://some.link'
+    test_dir = base_dir + '/geminitest'
+    # the file name is the date part of the published time followed
+    # by the lower case title with underscores instead of spaces
+    expected_filename = \
+        test_dir + '/2022-02-25_' + \
+        post_title.lower().replace(' ', '_') + '.gmi'
+    message_json = {
+        'object': {
+            'published': '2022-02-25T20:15:00Z',
+            'summary': post_title,
+            'content': post_text + ' ' + post_link
+        }
+    }
+    # debug is enabled and the final argument selects testing mode
+    assert blog_to_gemini(base_dir, 'someuser', 'somedomain',
+                          message_json, 'en',
+                          True, True)
+    assert os.path.isdir(test_dir)
+    assert os.path.isfile(expected_filename)
+    # heading, body text and link line should all be present
+    for expected_text in ('# ' + post_title + '\n',
+                          post_text,
+                          '=> ' + post_link):
+        assert text_in_file(expected_text, expected_filename)
+    shutil.rmtree(test_dir, ignore_errors=True)
+
+
 def run_all_tests():
    base_dir = os.getcwd()
    data_dir_testing(base_dir)
@ -9508,6 +9540,7 @@ def run_all_tests():
    _test_checkbox_names()
    _test_thread_functions()
    _test_functions()
+    _test_gemini_blog(base_dir)
    _test_actor_status()
    _test_filter_match()
    _test_blocking_domain(base_dir)