epicyon/poison.py

2086 lines
30 KiB
Python

# Module metadata: file name, authorship, licence, version, maintainer
# contact, release status and the module group this file is filed under.
__filename__ = "poison.py"
__author__ = "Bob Mottram"
__license__ = "AGPL3+"
__version__ = "1.5.0"
__maintainer__ = "Bob Mottram"
__email__ = "bob@libreserver.org"
__status__ = "Production"
__module_group__ = "Core"
import os
import random
from random import randint
# Common nouns (including a few proper names), from which html_poisoned
# picks uniformly at random. Laid out five entries per row.
common_nouns = (
    "time", "way", "year", "work", "government",
    "day", "man", "world", "life", "part",
    "house", "course", "case", "system", "place",
    "end", "group", "company", "party", "information",
    "school", "fact", "money", "point", "example",
    "state", "business", "night", "area", "water",
    "thing", "family", "head", "hand", "order",
    "john", "side", "home", "development", "week",
    "power", "country", "council", "use", "service",
    "room", "market", "problem", "court", "lot",
    "war", "police", "interest", "car", "law",
    "road", "form", "face", "education", "policy",
    "research", "sort", "office", "body", "person",
    "health", "mother", "question", "period", "name",
    "book", "level", "child", "control", "society",
    "minister", "view", "door", "line", "community",
    "south", "city", "god", "father", "centre",
    "effect", "staff", "position", "kind", "job",
    "woman", "action", "management", "act", "process",
    "north", "age", "evidence", "idea", "west",
    "support", "moment", "sense", "report", "mind",
    "church", "morning", "death", "change", "industry",
    "land", "care", "century", "range", "table",
    "back", "trade", "history", "study", "street",
    "committee", "rate", "word", "food", "language",
    "experience", "result", "team", "other", "sir",
    "section", "programme", "air", "authority", "role",
    "reason", "price", "town", "class", "nature",
    "subject", "department", "union", "bank", "member",
    "value", "need", "east", "practice", "type",
    "paper", "date", "decision", "figure", "right",
    "wife", "president", "university", "friend", "club",
    "quality", "voice", "lord", "stage", "king",
    "us", "situation", "light", "tax", "production",
    "march", "secretary", "art", "board", "may",
    "hospital", "month", "music", "cost", "field",
    "award", "issue", "bed", "project", "chapter",
    "girl", "game", "amount", "basis", "knowledge",
    "approach", "series", "love", "top", "news",
    "front", "future", "manager", "account", "computer",
    "security", "rest", "labour", "structure", "hair",
    "bill", "heart", "force", "attention", "movement",
    "success", "letter", "agreement", "capital", "analysis",
    "population", "environment", "performance", "model", "material",
    "theory", "growth", "fire", "chance", "boy",
    "relationship", "son", "sea", "record", "size",
    "property", "space", "term", "director", "plan",
    "behaviour", "treatment", "energy", "peter", "income",
    "cup", "scheme", "design", "response", "association",
    "choice", "pressure", "hall", "couple", "technology",
    "defence", "list", "chairman", "loss", "activity",
    "contract", "county", "wall", "paul", "difference",
    "army", "hotel", "sun", "product", "summer",
    "set", "village", "colour", "floor", "season",
    "unit", "park", "hour", "investment", "test",
    "garden", "husband", "employment", "style", "science",
    "look", "deal", "charge", "help", "economy",
    "new", "page", "risk", "advice", "event",
    "picture", "commission", "fish", "college", "oil",
    "doctor", "opportunity", "film", "conference", "operation",
    "application", "press", "extent", "addition", "station",
    "window", "shop", "access", "region", "doubt",
    "majority", "degree", "television", "blood", "statement",
    "sound", "election", "parliament", "site", "mark",
    "importance", "title", "species", "increase", "return",
    "concern", "public", "competition", "software", "glass",
    "lady", "answer", "earth", "daughter", "purpose",
    "responsibility", "leader", "river", "eye", "ability",
    "appeal", "opposition", "campaign", "respect", "task",
    "instance", "sale", "whole", "officer", "method",
    "division", "source", "piece", "pattern", "lack",
    "disease", "equipment", "surface", "oxford", "demand",
    "post", "mouth", "radio", "provision", "attempt",
    "sector", "firm", "status", "peace", "variety",
    "teacher", "show", "speaker", "baby", "arm",
    "base", "miss", "safety", "trouble", "culture",
    "direction", "context", "character", "box", "discussion",
    "past", "weight", "organisation", "start", "brother",
    "league", "condition", "machine", "argument", "sex",
    "budget", "english", "transport", "share", "mum",
    "cash", "principle", "exchange", "aid", "library",
    "version", "rule", "tea", "balance", "afternoon",
    "reference", "protection", "truth", "district", "turn",
    "smith", "review", "minute", "duty", "survey",
    "presence", "influence", "stone", "dog", "benefit",
    "collection", "executive", "speech", "function", "queen",
    "marriage", "stock", "failure", "kitchen", "student",
    "effort", "holiday", "career", "attack", "length",
    "horse", "progress", "plant", "visit", "relation",
    "ball", "memory", "bar", "opinion", "quarter",
    "impact", "scale", "race", "image", "trust",
    "justice", "edge", "gas", "railway", "expression",
    "advantage", "gold", "wood", "network", "text",
    "forest", "sister", "chair", "cause", "foot",
    "rise", "half", "winter", "corner", "insurance",
    "step", "damage", "credit", "pain", "possibility",
    "legislation", "strength", "speed", "crime", "hill",
    "debate", "will", "supply", "present", "confidence",
    "mary", "patient", "wind", "solution", "band",
    "museum", "farm", "pound", "henry", "match",
    "assessment", "message", "football", "animal", "skin",
    "scene", "article", "stuff", "introduction", "play",
    "administration", "fear", "dad", "proportion", "island",
    "contact", "japan", "claim", "kingdom", "video",
    "tv", "existence", "telephone", "move", "traffic",
    "distance", "relief", "cabinet", "unemployment", "reality",
    "target", "trial", "rock", "concept", "spirit",
    "accident", "organization", "construction", "coffee", "phone",
    "distribution", "train", "sight", "difficulty", "factor",
    "exercise", "weekend", "battle", "prison", "grant",
    "aircraft", "tree", "bridge", "strategy", "contrast",
    "communication", "background", "shape", "wine", "star",
    "hope", "selection", "detail", "user", "path",
    "client", "search", "master", "rain", "offer",
    "goal", "dinner", "freedom", "attitude", "while",
    "agency", "seat", "manner", "favour", "fig",
    "pair", "crisis", "smile", "prince", "danger",
    "call", "capacity", "output", "note", "procedure",
    "theatre", "tour", "recognition", "middle", "absence",
    "sentence", "package", "track", "card", "sign",
    "commitment", "player", "threat", "weather", "element",
    "conflict", "notice", "victory", "bottom", "finance",
    "fund", "violence", "file", "profit", "standard",
    "jack", "route", "china", "expenditure", "second",
    "discipline", "cell", "reaction", "castle", "congress",
    "individual", "lead", "consideration", "debt", "option",
    "payment", "exhibition", "reform", "emphasis", "spring",
    "audience", "feature", "touch", "estate", "assembly",
    "volume", "youth", "contribution", "curriculum", "appearance",
    "martin", "tom", "boat", "institute", "membership",
    "branch", "bus", "waste", "heat", "neck",
    "object", "captain", "driver", "challenge", "conversation",
    "occasion", "code", "crown", "birth", "silence",
    "literature", "faith", "hell", "entry", "transfer",
    "gentleman", "bag", "coal", "investigation", "leg",
    "belief", "total", "major", "document", "description",
    "murder", "aim", "manchester", "flight", "conclusion",
    "drug", "tradition", "pleasure", "connection", "owner",
    "treaty", "tony", "alan", "desire", "professor",
    "copy", "ministry", "acid", "palace", "address",
    "institution", "lunch", "generation", "partner", "engine",
    "newspaper", "cross", "reduction", "welfare", "definition",
    "key", "release", "vote", "examination", "judge",
    "atmosphere", "leadership", "sky", "breath", "creation",
    "row", "guide", "milk", "cover", "screen",
    "intention", "criticism", "jones", "silver", "customer",
    "journey", "explanation", "green", "measure", "brain",
    "significance", "phase", "injury", "run", "coast",
    "technique", "valley", "drink", "magazine", "potential",
    "drive", "revolution", "bishop", "settlement", "christ",
    "metal", "motion", "index", "adult", "inflation",
    "sport", "surprise", "pension", "factory", "tape",
    "flow", "iron", "trip", "lane", "pool",
    "independence", "hole", "flat", "content", "pay",
    "noise", "combination", "session", "appointment", "fashion",
    "consumer", "accommodation", "temperature", "mike", "religion",
    "author", "nation", "northern", "sample", "assistance",
    "interpretation", "aspect", "display", "shoulder", "agent",
    "gallery", "republic", "cancer", "proposal", "sequence",
    "simon", "ship", "interview", "vehicle", "democracy",
    "improvement", "involvement", "general", "enterprise", "van",
    "meal", "breakfast", "motor", "channel", "impression",
    "tone", "sheet", "pollution", "bob", "beauty",
    "square", "vision", "spot", "distinction", "brown",
    "crowd", "fuel", "desk", "sum", "decline",
    "revenue", "fall", "diet", "bedroom", "soil",
    "reader", "shock", "fruit", "behalf", "deputy",
    "roof", "nose", "steel", "artist", "graham",
    "plate", "song", "maintenance", "formation", "grass",
    "spokesman", "ice", "talk", "program", "link",
    "ring", "expert", "establishment", "plastic", "candidate",
    "rail", "passage", "joe", "parish", "emergency",
    "liability", "identity", "location", "framework", "strike",
    "countryside", "map", "lake", "household", "approval",
    "border", "bottle", "bird", "constitution", "autumn",
    "cat", "agriculture", "concentration", "guy", "dress",
    "victim", "mountain", "editor", "theme", "error",
    "loan", "stress", "recovery", "electricity", "recession",
    "wealth", "request", "comparison", "lewis", "white",
    "walk", "focus", "chief", "parent", "sleep",
    "mass", "jane", "bush", "foundation", "bath",
    "item", "lifespan", "publication", "decade", "beach",
    "sugar", "height", "charity", "writer", "panel",
    "struggle", "dream", "outcome", "efficiency", "offence",
    "resolution", "reputation", "specialist", "taylor", "pub",
    "cooperation", "port", "incident", "representation", "bread",
    "chain", "initiative", "clause", "resistance", "mistake",
    "worker", "advance", "empire", "notion", "mirror",
    "delivery", "chest", "licence", "frank", "average",
    "awareness", "travel", "expansion", "block", "alternative",
    "chancellor", "meat", "store", "self", "break",
    "drama", "corporation", "currency", "extension", "convention",
    "partnership", "skill", "furniture", "round", "regime",
    "inquiry", "rugby", "philosophy", "scope", "gate",
    "minority", "intelligence", "restaurant", "consequence", "mill",
    "golf", "retirement", "priority", "plane", "gun",
    "gap", "core", "uncle", "fun", "arrival",
    "snow", "no", "command", "abuse", "limit",
    "championship"
)
# Common English words and contractions used by html_poisoned.
# html_poisoned indexes this tuple with the square of a uniform random
# number, so entries nearer the start are chosen more often: keep the
# existing order when editing.
# Every entry, including the last, must be followed by a comma. Adjacent
# string literals without one are silently joined into a single word
# (previously "are" "he" "oh" became the one entry "areheoh").
common_words = (
    "you", "I", "to", "the", "a",
    "and", "that", "it", "of", "me",
    "what", "is", "in", "this", "know",
    "I'm", "for", "no", "have", "my",
    "don't", "just", "not", "do", "be",
    "on", "your", "was", "we", "it's",
    "with", "so", "but", "all", "well",
    "are", "he", "oh", "about", "right",
    "you're", "get", "here", "out", "going",
    "like", "yeah", "if", "her", "she",
    "can", "up", "want", "think", "that's",
    "now", "go", "him", "at", "how",
    "got", "there", "one", "did", "why",
    "see", "come", "good", "they", "really",
    "as", "would", "look", "when", "time",
    "will", "okay", "back", "can't", "mean",
    "tell", "I'll", "from", "hey", "were",
    "he's", "could", "didn't", "yes", "his",
    "been", "or", "something", "who", "because",
    "some", "had", "then", "say", "ok",
    "take", "an", "way", "us", "little",
    "make", "need", "gonna", "never", "we're",
    "too", "love", "she's", "I've", "sure",
    "them", "more", "over", "our", "sorry",
    "where", "what's", "let", "thing", "am",
    "maybe", "down", "man", "has", "uh",
    "very", "by", "there's", "should", "anything",
    "said", "much", "any", "life", "even",
    "off", "please", "doing", "thank", "give",
    "only", "thought", "help", "two", "talk",
    "people", "god", "still", "wait", "into",
    "find", "nothing", "again", "things", "let's",
    "doesn't", "call", "told", "great", "before",
    "better", "ever", "night", "than", "away",
    "first", "believe", "other", "feel", "everything",
    "work", "you've", "fine", "home", "after",
    "last", "these", "day", "keep", "does",
    "put", "around", "stop", "they're", "I'd",
    "guy", "long", "isn't", "always", "listen",
    "wanted", "Mr", "guys", "huh", "those",
    "big", "lot", "happened", "thanks", "won't",
    "trying", "kind", "wrong", "through", "talking",
    "made", "new", "being", "guess", "hi",
    "care", "bad", "mom", "remember", "getting",
    "we'll", "together", "dad", "leave", "mother",
    "place", "understand", "wouldn't", "actually", "hear",
    "baby", "nice", "father", "else", "stay",
    "done", "wasn't", "their", "course", "might",
    "mind", "every", "enough", "try", "hell",
    "came", "someone", "you'll", "own", "family",
    "whole", "another", "house", "jack", "yourself",
    "idea", "ask", "best", "must", "coming",
    "old", "looking", "woman", "hello", "which",
    "years", "room", "money", "left", "knew",
    "tonight", "real", "son", "hope", "name",
    "same", "went", "um", "hmm", "happy",
    "pretty", "saw", "girl", "sir", "show",
    "friend", "already", "saying", "may", "next",
    "three", "job", "problem", "minute", "found",
    "world", "thinking", "haven't", "heard", "honey",
    "matter", "myself", "couldn't", "exactly", "having",
    "ah", "probably", "happen", "we've", "hurt",
    "boy", "both", "while", "dead", "gotta",
    "alone", "since", "excuse", "start", "kill",
    "hard", "you'd", "today", "car", "ready",
    "until", "without", "whatever", "wants", "hold",
    "wanna", "yet", "seen", "deal", "took",
    "once", "gone", "called", "morning", "supposed",
    "friends", "head", "stuff", "most", "used",
    "worry", "second", "part", "live", "truth",
    "school", "face", "forget", "TRUE", "business",
    "each", "cause", "soon", "knows", "few",
    "telling", "wife", "who's", "use", "chance",
    "run", "move", "anyone", "person", "bye",
    "somebody", "dr", "heart", "such", "miss",
    "married", "point", "later", "making", "meet",
    "anyway", "many", "phone", "reason", "damn",
    "lost", "looks", "bring", "case", "turn",
    "wish", "tomorrow", "kids", "trust", "check",
    "change", "end", "late", "anymore", "five",
    "least", "town", "aren't", "ha", "working",
    "year", "makes", "taking", "means", "brother",
    "play", "hate", "ago", "says", "beautiful",
    "gave", "fact", "crazy", "party", "sit",
    "open", "afraid", "between", "important", "rest",
    "fun", "kid", "word", "watch", "glad",
    "everyone", "days", "sister", "minutes", "everybody",
    "bit", "couple", "whoa", "either", "mrs",
    "feeling", "daughter", "wow", "gets", "asked",
    "under", "break", "promise", "door", "set",
    "close", "hand", "easy", "question", "doctor",
    "tried", "far", "walk", "needs", "trouble",
    "mine", "though", "times", "different", "killed",
    "hospital", "anybody", "sam", "alright", "wedding",
    "shut", "able", "die", "perfect", "police",
    "stand", "comes", "hit", "story", "ya",
    "mm", "waiting", "dinner", "against", "funny",
    "husband", "almost", "stupid", "pay", "answer",
    "four", "office", "cool", "eyes", "news",
    "child", "shouldn't", "half", "side", "yours",
    "moment", "sleep", "read", "where's", "started",
    "young", "men", "sounds", "sonny", "lucky",
    "pick", "sometimes", "bed", "also", "date",
    "line", "plan", "hours", "lose", "fire",
    "free", "hands", "serious", "shit", "behind",
    "inside", "high", "ahead", "week", "wonderful",
    "fight", "past", "cut", "quite", "number",
    "he'll", "sick", "it'll", "game", "eat",
    "nobody", "goes", "death", "along", "save",
    "seems", "finally", "lives", "worried", "upset",
    "met", "book", "brought", "seem", "sort",
    "safe", "living", "children", "weren't", "leaving",
    "front", "shot", "loved", "asking", "running",
    "clear", "figure", "hot", "felt", "six",
    "parents", "drink", "absolutely", "how's", "daddy",
    "sweet", "alive", "Paul", "sense", "meant",
    "happens", "David", "special", "bet", "blood",
    "ain't", "kidding", "lie", "full", "meeting",
    "dear", "coffee", "seeing", "sound", "fault",
    "water", "fuck", "ten", "women", "john",
    "welcome", "buy", "months", "hour", "speak",
    "lady", "jen", "thinks", "Christmas", "body",
    "order", "outside", "hang", "possible", "worse",
    "company", "mistake", "ooh", "handle", "spend",
    "totally", "giving", "control", "here's", "marriage",
    "realize", "power", "president", "unless", "sex",
    "girls", "send", "needed", "taken", "died",
    "scared", "picture", "talked", "jake", "ass",
    "hundred", "changed", "completely", "explain", "playing",
    "certainly", "sign", "boys", "relationship", "loves",
    "fucking", "hair", "lying", "choice", "anywhere",
    "secret", "future", "weird", "luck", "she'll",
    "max", "Luis", "turned", "known", "touch",
    "kiss", "crane", "questions", "obviously", "wonder",
    "pain", "calling", "somewhere", "throw", "straight",
    "grace", "cold", "white", "fast", "words",
    "food", "none", "drive", "feelings", "they'll",
    "worked", "marry", "light", "test", "drop",
    "cannot", "frank", "sent", "city", "dream",
    "protect", "twenty", "class", "lucy", "surprise",
    "its", "sweetheart", "forever", "poor", "looked",
    "mad", "except", "gun", "dance", "takes",
    "appreciate", "especially", "situation", "besides", "weeks",
    "pull", "himself", "hasn't", "act", "worth",
    "Sheridan", "amazing", "top", "given", "expect",
    "ben", "rather", "Julian", "involved", "swear",
    "piece", "busy", "law", "decided", "black",
    "joey", "happening", "movie", "we'd", "catch",
    "antonio", "country", "less", "perhaps", "step",
    "fall", "watching", "kept", "darling", "dog",
    "win", "air", "honor", "personal", "moving",
    "till", "admit", "problems", "murder", "strong",
    "he'd", "evil", "definitely", "feels", "information",
    "honest", "eye", "broke", "missed", "longer",
    "dollars", "tired", "evening", "human", "starting",
    "Ross", "red", "entire", "trip", "club",
    "suppose", "calm", "imagine", "fair", "caught",
    "blame", "street", "sitting", "favor", "apartment",
    "court", "terrible", "clean", "tony", "learn",
    "Alison", "Rick", "works", "relax", "york",
    "million", "charity", "accident", "wake", "prove",
    "Danny", "smart", "message", "missing", "forgot",
    "small", "interested", "table", "nbsp", "become",
    "craig", "mouth", "pregnant", "middle", "billy",
    "ring", "careful", "shall", "dude", "team",
    "ride", "figured", "wear", "shoot", "stick",
    "ray", "follow", "angry", "instead", "buddy",
    "write", "stopped", "early", "angel", "nick",
    "ran", "war", "standing", "forgive", "jail",
    "wearing", "ladies", "kinda", "lunch", "eight",
    "gotten", "hoping", "phoebe", "thousand", "ridge",
    "music", "luke", "paper", "tough", "tape",
    "state", "count", "college", "boyfriend", "proud",
    "agree", "birthday", "bill", "seven", "they've",
    "Timmy", "history", "share", "offer", "hurry",
    "feet", "wondering", "simple", "decision", "building",
    "ones", "finish", "voice", "herself", "Chris",
    "would've", "list", "mess", "deserve", "evidence",
    "cute", "dress", "Richard", "interesting", "Jesus",
    "hotel", "enjoy", "Ryan", "Lindsay", "quiet",
    "concerned", "road", "eve", "staying", "short",
    "beat", "sweetie", "mention", "clothes", "finished",
    "fell", "neither", "fix", "victor", "respect",
    "spent", "prison", "attention", "holding", "calls",
    "near", "surprised", "bar", "beth", "pass",
    "keeping", "gift", "hadn't", "putting", "dark",
    "self", "owe", "using", "nora", "ice",
    "helping", "bitch", "normal", "aunt", "lawyer",
    "apart", "certain", "plans", "girlfriend", "floor",
    "whether", "everything's", "present", "earth", "private",
    "box", "Dawson", "cover", "judge", "upstairs",
    "sake", "mommy", "possibly", "worst",
)
def load_dictionary(base_dir: str) -> []:
    """Loads a dictionary from file

    Reads custom_dictionary.txt from base_dir if it exists, otherwise
    dictionary.txt, and returns its words as a list, one word per line.
    Surrounding whitespace is removed and blank lines are skipped, so
    that the list never contains empty words.
    Returns an empty list if neither file exists or if the file could
    not be read.
    """
    filename = base_dir + '/custom_dictionary.txt'
    if not os.path.isfile(filename):
        filename = base_dir + '/dictionary.txt'
        if not os.path.isfile(filename):
            return []
    words = []
    try:
        with open(filename, 'r', encoding='utf-8') as fp_dict:
            # a trailing newline or blank line would otherwise become
            # an empty word which could later be picked at random
            stripped = (line_str.strip() for line_str in fp_dict)
            words = [wrd for wrd in stripped if wrd]
    except OSError:
        print('EX: unable to load dictionary ' + filename)
    return words
def load_2grams(base_dir: str) -> {}:
    """Loads word pairs (2-grams) from file

    custom_2grams.txt within base_dir is preferred, falling back to
    2grams.txt. Only lines consisting of exactly three tab separated
    fields are used: the first field is ignored, the second is the
    leading word and the third is a word which may follow it.
    Returns a dict mapping each leading word to the list of distinct
    following words, in the order in which they first appear. The dict
    is empty if neither file exists or if the file could not be read.
    """
    filename = ''
    for candidate in ('/custom_2grams.txt', '/2grams.txt'):
        if os.path.isfile(base_dir + candidate):
            filename = base_dir + candidate
            break
    if not filename:
        return {}

    text = ''
    try:
        with open(filename, 'r', encoding='utf-8') as fp_dict:
            text = fp_dict.read()
    except OSError:
        print('EX: unable to load 2-grams ' + filename)

    twograms = {}
    for line_str in text.split('\n'):
        fields = line_str.split('\t')
        if len(fields) != 3:
            continue
        _, first_word, second_word = fields
        followers = twograms.setdefault(first_word, [])
        if second_word not in followers:
            followers.append(second_word)
    return twograms
def _poisoned_word(prev_wrd: str, dictionary: [],
                   twograms: {}) -> (str, bool):
    """Returns the next word for a poisoned sentence, together with a
    flag which is True if the word was chosen as a known follower of
    the previous word
    """
    # common word sequences
    if prev_wrd and twograms.get(prev_wrd) and randint(1, 10) <= 7:
        return random.choice(twograms[prev_wrd]), True

    if randint(1, 100) <= 37:
        # pick a common noun
        return random.choice(common_nouns), False

    # random.choice raises IndexError on an empty sequence, so if no
    # dictionary was loaded then always fall back to the common words
    if not dictionary or randint(1, 10) <= 7:
        # pick a common word. Squaring the uniform sample biases the
        # index towards the start of common_words
        distribution = random.uniform(0.0, 1.0)
        common_index = \
            int(distribution * distribution * (len(common_words) - 1))
        return common_words[common_index], False

    return random.choice(dictionary), False


def html_poisoned(dictionary: [], twograms: {}) -> str:
    """Returns a poisoned HTML document for LLM response
    Statistically similar to English language, but semantically worthless
    word salad

    dictionary is a list of words and may be empty, in which case only
    the built-in common nouns and common words are used.
    twograms is a dict mapping a word to the list of words which may
    follow it.
    The document has between 1 and 5 paragraphs, each containing
    between 1 and 5 sentences of up to 20 words.
    """
    html_str = \
        '<html lang="en">' + \
        '<head>' + \
        '<meta charset="utf-8">' + \
        '</head>' + \
        '<body>'
    paragraphs = randint(1, 5)
    for _ in range(paragraphs):
        sentence_list = []
        sentences = randint(1, 5)
        for _ in range(sentences):
            no_of_words = randint(3, 20)
            tokens = []
            prev_wrd = ''
            for word_index in range(no_of_words):
                wrd, pair_found = \
                    _poisoned_word(prev_wrd, dictionary, twograms)
                if not wrd:
                    # blank dictionary or 2-gram entries would only
                    # produce doubled spaces
                    continue
                prev_wrd = wrd
                if not tokens:
                    # capitalise the first letter of the sentence only.
                    # str.title() would also upper case the letter
                    # after an apostrophe, turning "don't" into "Don'T"
                    wrd = wrd[:1].upper() + wrd[1:]
                # occasionally follow the word with a comma, but never
                # after the last word or after a 2-gram continuation
                if randint(1, 10) == 1 and \
                   word_index < no_of_words - 1 and not pair_found:
                    wrd += ','
                tokens.append(wrd)
            if tokens:
                sentence_list.append(' '.join(tokens) + '.')
        html_str += '<p>' + ' '.join(sentence_list) + '</p>'
    html_str += '</body></html>'
    return html_str