Return a poisoned response to LLM scrapers

main
Bob Mottram 2024-08-17 19:29:32 +01:00
parent 4c7cbd6341
commit cb069bbed5
5 changed files with 2073 additions and 2 deletions

View File

@ -95,6 +95,7 @@ from httpcodes import write2
from httpheaders import set_headers from httpheaders import set_headers
from daemon_utils import has_accept from daemon_utils import has_accept
from daemon_utils import is_authorized from daemon_utils import is_authorized
from poison import load_dictionary
class PubServer(BaseHTTPRequestHandler): class PubServer(BaseHTTPRequestHandler):
@ -876,6 +877,9 @@ def run_daemon(accounts_data_dir: str,
# timeout used when getting rss feeds # timeout used when getting rss feeds
httpd.rss_timeout_sec = 20 httpd.rss_timeout_sec = 20
# load dictionary used for LLM poisoning
httpd.dictionary = load_dictionary(base_dir)
# timeout used when checking for actor changes when clicking an avatar # timeout used when checking for actor changes when clicking an avatar
# and entering person options screen # and entering person options screen
if check_actor_timeout < 2: if check_actor_timeout < 2:

View File

@ -210,6 +210,7 @@ from daemon_get_blog import show_blog_page
from daemon_get_links import edit_links2 from daemon_get_links import edit_links2
from daemon_get_login import redirect_to_login_screen from daemon_get_login import redirect_to_login_screen
from daemon_get_login import show_login_screen from daemon_get_login import show_login_screen
from poison import html_poisoned
# Blogs can be longer, so don't show many per page # Blogs can be longer, so don't show many per page
MAX_POSTS_IN_BLOGS_FEED = 4 MAX_POSTS_IN_BLOGS_FEED = 4
@ -260,8 +261,13 @@ def daemon_http_get(self) -> None:
# headers used by LLM scrapers # headers used by LLM scrapers
if 'oai-host-hash' in self.headers: if 'oai-host-hash' in self.headers:
print('GET HTTP LLM scraper bounced: ' + str(self.headers)) msg = html_poisoned(self.server.dictionary)
http_402(self) msg = msg.encode('utf-8')
msglen = len(msg)
set_headers(self, 'text/html', msglen,
'', calling_domain, False)
write2(self, msg)
print('GET HTTP LLM scraper poisoned: ' + str(self.headers))
return return
# replace invalid .well-known path, prior to checking for suspicious paths # replace invalid .well-known path, prior to checking for suspicious paths

1000
dictionary.txt 100644

File diff suppressed because it is too large Load Diff

View File

@ -123,6 +123,8 @@ from happening import dav_day_via_server
from content import import_emoji from content import import_emoji
from relationships import get_moved_accounts from relationships import get_moved_accounts
from blocking import get_blocks_via_server from blocking import get_blocks_via_server
from poison import html_poisoned
from poison import load_dictionary
def str2bool(value_str) -> bool: def str2bool(value_str) -> bool:
@ -440,6 +442,11 @@ def _command_options() -> None:
dest='shared_items_federated_domains', dest='shared_items_federated_domains',
help='Specify federation list for shared items, ' + help='Specify federation list for shared items, ' +
'separated by spaces') 'separated by spaces')
parser.add_argument("--poisoned", "--poison",
dest='poisoned',
type=str2bool, nargs='?',
const=True, default=False,
help="Example poisoned output")
parser.add_argument("--following", "--followingList", parser.add_argument("--following", "--followingList",
dest='followingList', dest='followingList',
type=str2bool, nargs='?', type=str2bool, nargs='?',
@ -840,6 +847,14 @@ def _command_options() -> None:
argb = parser.parse_args() argb = parser.parse_args()
if argb.poisoned:
# LLM poisoning example
base_dir = os.getcwd()
dictionary = load_dictionary(base_dir)
poisoned_str = html_poisoned(dictionary)
print(poisoned_str)
sys.exit()
debug = False debug = False
if argb.debug: if argb.debug:
debug = True debug = True

1046
poison.py 100644

File diff suppressed because it is too large Load Diff