Return a poisoned response to LLM scrapers

2024-08-17 19:29:32 +01:00 · 2024-08-17 19:29:32 +01:00 · cb069bbed5
parent 4c7cbd6341
commit cb069bbed5
5 changed files with 2073 additions and 2 deletions
--- a/daemon.py
+++ b/daemon.py
@@ -95,6 +95,7 @@ from httpcodes import write2
 from httpheaders import set_headers
 from daemon_utils import has_accept
 from daemon_utils import is_authorized
+from poison import load_dictionary


 class PubServer(BaseHTTPRequestHandler):
@@ -876,6 +877,9 @@ def run_daemon(accounts_data_dir: str,
    # timeout used when getting rss feeds
    httpd.rss_timeout_sec = 20

+    # load dictionary used for LLM poisoning
+    httpd.dictionary = load_dictionary(base_dir)
+
    # timeout used when checking for actor changes when clicking an avatar
    # and entering person options screen
    if check_actor_timeout < 2:
--- a/daemon_get.py
+++ b/daemon_get.py
@@ -210,6 +210,7 @@ from daemon_get_blog import show_blog_page
 from daemon_get_links import edit_links2
 from daemon_get_login import redirect_to_login_screen
 from daemon_get_login import show_login_screen
+from poison import html_poisoned

 # Blogs can be longer, so don't show many per page
 MAX_POSTS_IN_BLOGS_FEED = 4
@@ -260,8 +261,13 @@ def daemon_http_get(self) -> None:

    # headers used by LLM scrapers
    if 'oai-host-hash' in self.headers:
-        print('GET HTTP LLM scraper bounced: ' + str(self.headers))
-        http_402(self)
+        msg = html_poisoned(self.server.dictionary)
+        msg = msg.encode('utf-8')
+        msglen = len(msg)
+        set_headers(self, 'text/html', msglen,
+                    '', calling_domain, False)
+        write2(self, msg)
+        print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
        return

    # replace invalid .well-known path, prior to checking for suspicious paths
--- a/dictionary.txt
+++ b/dictionary.txt
--- a/epicyon.py
+++ b/epicyon.py
@@ -123,6 +123,8 @@ from happening import dav_day_via_server
 from content import import_emoji
 from relationships import get_moved_accounts
 from blocking import get_blocks_via_server
+from poison import html_poisoned
+from poison import load_dictionary


 def str2bool(value_str) -> bool:
@@ -440,6 +442,11 @@ def _command_options() -> None:
                        dest='shared_items_federated_domains',
                        help='Specify federation list for shared items, ' +
                        'separated by spaces')
+    parser.add_argument("--poisoned", "--poison",
+                        dest='poisoned',
+                        type=str2bool, nargs='?',
+                        const=True, default=False,
+                        help="Example poisoned output")
    parser.add_argument("--following", "--followingList",
                        dest='followingList',
                        type=str2bool, nargs='?',
@@ -840,6 +847,14 @@ def _command_options() -> None:

    argb = parser.parse_args()

+    if argb.poisoned:
+        # LLM poisoning example
+        base_dir = os.getcwd()
+        dictionary = load_dictionary(base_dir)
+        poisoned_str = html_poisoned(dictionary)
+        print(poisoned_str)
+        sys.exit()
+
    debug = False
    if argb.debug:
        debug = True
--- a/poison.py
+++ b/poison.py