From 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Sat, 2 Dec 2023 17:10:48 -0500 Subject: Filter out profane words --- data/profane.txt | 1163 +++++++++++++++++++++++++++++++++++++++++++++++ generator/generator.cpp | 27 +- generator/generator.h | 4 +- generator/main.cpp | 7 +- 4 files changed, 1193 insertions(+), 8 deletions(-) create mode 100644 data/profane.txt diff --git a/data/profane.txt b/data/profane.txt new file mode 100644 index 0000000..b07b823 --- /dev/null +++ b/data/profane.txt @@ -0,0 +1,1163 @@ +abortion +alla +allah +alligatorbait +anal +analannie +analsex +anus +arab +arabs +areola +aroused +arse +asian +assbagger +assblaster +assclown +asscowboy +asses +assfuck +assfucker +asshat +asshole +assholes +asshore +assjockey +asskiss +asskisser +assklown +asslick +asslicker +asslover +assman +assmonkey +assmunch +assmuncher +asspacker +asspirate +asspuppies +assranger +asswhore +asswipe +backdoorman +badfuck +balllicker +balls +ballsack +barelylegal +barface +barfface +bast +bastard +bazongas +bazooms +beaner +beastality +beastial +beastiality +beatoff +beat-off +beatyourmeat +bestial +bestiality +biatch +bicurious +bigass +bigbastard +bigbutt +bitch +bitcher +bitches +bitchez +bitchin +bitching +bitchslap +bitchy +biteme +blackman +blackout +blacks +blowjob +boang +bogan +bohunk +bollick +bollock +bombers +bombing +bomd +bondage +boner +bong +boob +boobies +boobs +booby +boody +boong +boonga +boonie +booty +bootycall +bountybar +brea5t +breast +breastjob +breastlover +breastman +brothel +bugger +buggered +buggery +bullcrap +bulldike +bulldyke +bullshit +bumblefuck +bumfuck +bunga +bunghole +butchbabes +butchdike +butchdyke +buttbang +butt-bang +buttface +buttfuck +butt-fuck +buttfucker +butt-fucker +buttfuckers +butt-fuckers +butthead +buttman +buttmunch +buttmuncher +buttpirate +buttplug +buttstain +byatch +cacker +cameljockey +cameltoe +carpetmuncher +carruth +chav +cherrypopper +chickslick +children's +chin +chinaman +chinamen +chinese +chink +chinky +choad +chode +christ +christian +clamdigger +clamdiver +clit +clitoris +clogwog +cock +cockblock +cockblocker +cockcowboy +cockfight +cockhead +cockknob +cocklicker +cocklover +cocknob +cockqueen +cockrider +cocksman +cocksmith +cocksmoker +cocksucer +cocksuck +cocksucked +cocksucker +cocksucking +cocktail +cocktease +cocky +cohee +coitus +colored +coloured +commie +communist +condom +coolie +cooly +coon +coondog +copulate +cornhole +corruption +cra5h +crackpipe +crackwhore +crack-whore +crapola +crapper +crappy +creamy +crotch +crotchjockey +crotchmonkey +crotchrot +cum +cumbubble +cumfest +cumjockey +cumm +cummer +cumming +cumquat +cumqueen +cumshot +cunilingus +cunillingus +cunn +cunnilingus +cunntt +cunt +cunteyed +cuntfuck +cuntfucker +cuntlick +cuntlicker +cuntlicking +cuntsucker +cybersex +cyberslimer +dago +dahmer +damnation +damnit +darkie +darky +datnigga +deapthroat +deepthroat +defecate +dego +deth +devilworshipper +dick +dickbrain +dickforbrains +dickhead +dickless +dicklick +dicklicker +dickman +dickwad +dickweed +diddle +dike +dildo +dingleberry +dipshit +dipstick +dix +dixiedike +dixiedyke +doggiestyle +doggystyle +dong +doodoo +doo-doo +dragqween +dripdick +dumbass +dumbbitch +dumbfuck +dyefly +dyke +easyslut +eatballs +eatme +eatpussy +ecstacy +ejaculate +ejaculated +ejaculating +ejaculation +enema +erect +erection +ero +escort +ethiopian +evl +excrement +facefucker +faeces +fag +fagging +faggot +fagot +fannyfucker +farty +fastfuck +fatah +fatass +fatfuck +fatfucker +fatso +fckcum +feces +felatio +felch +felcher +felching +fellatio +feltch +feltcher +feltching +fetish +filipina +filipino +fingerfood +fingerfuck +fingerfucked +fingerfucker +fingerfuckers +fingerfucking +fister +fistfuck +fistfucked +fistfucker +fistfucking +fisting +flange +flasher +floo +flydie +flydye +fok +fondle +footaction +footfuck +footfucker +footlicker +footstar +foreskin +forni +fornicate +foursome +freakfuck +freakyfucker +freefuck +fu +fubar +fuc +fucck +fuck +fucka +fuckable +fuckbag +fuckbuddy +fucked +fuckedup +fucker +fuckers +fuckface +fuckfest +fuckfreak +fuckfriend +fuckhead +fuckher +fuckin +fuckina +fucking +fuckingbitch +fuckinnuts +fuckinright +fuckit +fuckknob +fuckme +fuckmehard +fuckmonkey +fuckoff +fuckpig +fucks +fucktard +fuckwhore +fuckyou +fudgepacker +fuk +fuks +funfuck +fungus +fuuck +gangbang +gangbanged +gangbanger +gangsta +gatorbait +gaymuthafuckinwhore +gaysex +geni +genital +german +getiton +ginzo +gipp +givehead +glazeddonut +gob +godammit +goddamit +goddammit +goddamn +goddamned +goddamnes +goddamnit +goddamnmuthafucker +goldenshower +gonorrehea +gonzagas +gook +gotohell +goy +goyim +greaseball +gringo +groe +grostulation +gubba +gummer +gyp +gypo +gypp +gyppie +gyppo +gyppy +gypsy +hamas +handjob +hapa +hardon +harem +headfuck +headlights +hebe +heeb +henhouse +heterosexual +hillbillies +hindoo +hiscock +hitler +hitlerism +hitlerist +hiv +ho +hobo +hodgie +hoes +holestuffer +homicide +homobangers +honger +honkers +honkey +honky +hooker +hookers +hooters +hore +hork +horney +horniest +horny +horseshit +hosejob +hoser +hotdamn +hotpussy +hottotrot +hummer +husky +hussy +hustler +hymen +hymie +iblowu +ikey +incest +insest +intercourse +interracial +intheass +inthebuff +israel +israeli +israel's +italiano +jackass +jackoff +jackshit +jacktheripper +jade +jap +japanese +japcrap +jebus +jerkoff +jesus +jesuschrist +jew +jewish +jiga +jigaboo +jigg +jigga +jiggabo +jigger +jiggy +jihad +jijjiboo +jimfish +jism +jiz +jizim +jizjuice +jizm +jizz +jizzim +jizzum +juggalo +jugs +junglebunny +kaffer +kaffir +kaffre +kafir +kanake +kigger +kike +kink +kinky +kissass +kkk +knockers +kock +kondum +koon +kotex +krap +krappy +kraut +kum +kumbubble +kumbullbe +kummer +kumming +kumquat +kums +kunilingus +kunnilingus +kunt +ky +kyke +lactate +lapdance +lesbin +lesbo +lez +lezbe +lezbefriends +lezbo +lezz +lezzo +libido +licker +lickme +limey +limpdick +limy +liquor +livesex +loadedgun +lolita +looser +lovebone +lovegoo +lovegun +lovejuice +lovemuscle +lovepistol +loverocket +lowlife +lsd +lubejob +lucifer +luckycammeltoe +lugan +lynch +macaca +magicwand +mams +manhater +manpaste +marijuana +mastabate +mastabater +masterbate +masterblaster +mastrabator +masturbate +masturbating +mattressprincess +meatbeatter +meatrack +mexican +mgger +mggor +mickeyfinn +mideast +milf +mockey +mockie +mocky +mofo +moky +moles +molest +molestation +molester +molestor +moneyshot +mooncricket +mormon +moslem +mosshead +mothafuck +mothafucka +mothafuckaz +mothafucked +mothafucker +mothafuckin +mothafucking +mothafuckings +motherfuck +motherfucked +motherfucker +motherfuckin +motherfucking +motherfuckings +motherlovebone +muff +muffdive +muffdiver +muffindiver +mufflikcer +mulatto +muncher +munt +muslim +nastybitch +nastyho +nastyslut +nastywhore +nazi +necro +negro +negroes +negroid +negro's +nig +niger +nigerian +nigerians +nigg +nigga +niggah +niggaracci +niggard +niggarded +niggarding +niggardliness +niggardliness's +niggardly +niggards +niggard's +niggaz +nigger +niggerhead +niggerhole +niggers +nigger's +niggle +niggled +niggles +niggling +nigglings +niggor +niggur +niglet +nignog +nigr +nigra +nigre +nip +nipple +nipplering +nittit +nlgger +nlggor +nofuckingway +nookey +nookie +noonan +nooner +nudger +nuke +nutfucker +nymph +ontherag +oral +orga +orgasim +orgasm +orgies +orgy +osama +paki +palesimian +palestinian +pansies +pansy +panti +panties +payo +pearlnecklace +pecker +peckerwood +peehole +pee-pee +peepshow +peepshpw +pendy +penetration +peni5 +penile +penis +penises +penthouse +perv +phonesex +phuk +phuked +phuking +phukked +phukking +phungky +phuq +pi55 +picaninny +piccaninny +pickaninny +piker +pikey +piky +pimper +pimpjuic +pimpjuice +pimpsimp +pindick +pisser +pisses +pisshead +pissin +pissing +pissoff +pistol +pixie +pixy +playboy +playgirl +pocha +pocho +pocketpool +pohm +polack +pom +pommie +pommy +poo +poon +poontang +pooper +pooperscooper +pooping +poorwhitetrash +popimp +porchmonkey +porn +pornflick +pornking +porno +pornography +pornprincess +pric +prick +prickhead +primetime +propaganda +prostitute +protestant +pu55i +pu55y +pube +pubic +pubiclice +pud +pudboy +pudd +puddboy +puntang +purinapricness +puss +pussie +pussies +pussy +pussycat +pussyeater +pussyfucker +pussylicker +pussylips +pussylover +pussypounder +pusy +quashie +queef +quickie +quim +ra8s +rabbi +racial +racist +radical +radicals +raghead +randy +rape +raped +raper +rapist +rearend +rearentry +rectum +redlight +redneck +reefer +reestie +refugee +rentafuck +republican +rere +retard +retarded +ribbed +rigger +rimjob +rimming +roach +robber +roundeye +rump +russki +russkie +sadis +sadom +samckdaddy +sandm +sandnigger +satan +scag +scallywag +scat +schlong +screwyou +scrotum +semen +seppo +servant +sex +sexed +sexfarm +sexhound +sexhouse +sexing +sexkitten +sexpot +sexslave +sextogo +sextoy +sextoys +sexual +sexually +sexwhore +sexy +sexymoma +sexy-slim +shag +shaggin +shagging +shat +shav +shawtypimp +sheeney +shhit +shinola +shitcan +shitdick +shite +shiteater +shited +shitface +shitfaced +shitfit +shitforbrains +shitfuck +shitfucker +shitfull +shithapens +shithappens +shithead +shithouse +shiting +shitlist +shitola +shitoutofluck +shits +shitstain +shitted +shitter +shitting +shortfuck +showtime +sissy +sixsixsix +sixtynine +sixtyniner +skank +skankbitch +skankfuck +skankwhore +skanky +skankybitch +skankywhore +skinflute +skum +skumbag +slanteye +slapper +slav +slave +slavedriver +sleezebag +sleezeball +slideitin +slimebucket +slopehead +slopey +slopy +slut +sluts +slutt +slutting +slutty +slutwear +slutwhore +smack +smackthemonkey +smut +snatch +snatchpatch +snigger +sniggered +sniggering +sniggers +snigger's +sniper +snowback +snownigger +sodom +sodomise +sodomite +sodomize +sodomy +sonofabitch +sonofbitch +sooty +sos +soviet +spaghettibender +spaghettinigger +spankthemonkey +sperm +spermacide +spermbag +spermhearder +spermherder +spic +spick +spig +spigotty +spik +spitter +splittail +spooge +spreadeagle +spunk +spunky +squaw +stagg +stiffy +strapon +stringer +stripclub +stroking +stupidfuck +stupidfucker +suckdick +sucker +suckme +suckmyass +suckmydick +suckmytit +suckoff +suicide +swallower +swalow +swastika +sweetness +syphilis +taff +tampon +tang +tantra +tarbaby +tard +teat +teste +testicle +testicles +thicklips +thirdeye +thirdleg +threesome +threeway +timbernigger +tinkle +tit +titbitnipply +titfuck +titfucker +titfuckin +titjob +titlicker +titlover +tits +tittie +titties +titty +tnt +tongethruster +tonguethrust +tonguetramp +tortur +tosser +towelhead +trailertrash +tramp +trannie +tranny +transexual +transsexual +transvestite +triplex +trisexual +trojan +trots +tuckahoe +tunneloflove +turd +turnon +twat +twink +twinkie +twobitwhore +uck +uk +unfuckable +upskirt +uptheass +upthebutt +usama +uterus +vagina +vaginal +vatican +vibr +vibrater +vibrator +vietcong +virgin +virginbreaker +vulva +wab +wank +wanker +wanking +waysted +weapon +weenie +weewee +welcher +welfare +wetb +wetback +wetspot +whacker +whash +whigger +whiskeydick +whiskydick +whit +whitenigger +whites +whitetrash +whitey +whiz +whop +whore +whorefucker +whorehouse +wigger +willie +williewanker +willy +wn +wog +women's +wop +wtf +wuss +wuzzie +xtc +xxx +yankee +yellowman +zigabo +zipperhead \ No newline at end of file diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include constexpr int MIN_FREQUENCY = 2000000; @@ -49,11 +51,12 @@ std::list readFile(std::string path, bool uniq = false) { generator::generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, - std::string outputPath) + std::string datadirPath, std::string outputPath) : agidPath_(agidPath), wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), wordfreqPath_(wordfreqPath), + datadirPath_(datadirPath), outputPath_(outputPath) { // Ensure AGID infl.txt exists if (!std::ifstream(agidPath_)) { @@ -102,6 +105,14 @@ void generator::run() { } } + std::unordered_set profane; + { + std::list lines(readFile(datadirPath_ / "profane.txt")); + for (const std::string& line : lines) { + profane.insert(line); + } + } + { std::list lines(readFile(wordNetPath_ + "wn_s.pl")); hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); @@ -142,6 +153,11 @@ void generator::run() { continue; } + // Ignore any profane words. + if (profane.count(text)) { + continue; + } + // The WordNet data does contain duplicates, so we need to check that we // haven't already created this word. std::pair lookup(synset_id, wnum); @@ -175,7 +191,8 @@ void generator::run() { } if (!word_by_base_.count(infinitive) && - !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { + !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && + !profane.count(infinitive))) { continue; } @@ -262,8 +279,10 @@ void generator::run() { // Compile the forms we have mapped. for (const std::list& infl_list : inflections) { for (const std::string& infl : infl_list) { - size_t form_id = LookupOrCreateForm(infl); - AddFormToWord(form_id, word_id); + if (!profane.count(infl)) { + size_t form_id = LookupOrCreateForm(infl); + AddFormToWord(form_id, word_id); + } } } } diff --git a/generator/generator.h b/generator/generator.h index a97b0b0..923fc17 100644 --- a/generator/generator.h +++ b/generator/generator.h @@ -1,6 +1,7 @@ #ifndef GENERATOR_H_D5C6A724 #define GENERATOR_H_D5C6A724 +#include #include #include #include @@ -22,7 +23,7 @@ class generator { generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, - std::string outputPath); + std::string datadirPath, std::string outputPath); // Action @@ -54,6 +55,7 @@ class generator { std::string wordNetPath_; std::string cmudictPath_; std::string wordfreqPath_; + std::filesystem::path datadirPath_; // Output diff --git a/generator/main.cpp b/generator/main.cpp index c958421..94bf0a1 100644 --- a/generator/main.cpp +++ b/generator/main.cpp @@ -4,20 +4,21 @@ #include "generator.h" void printUsage() { - std::cout << "usage: generator agid wordnet cmudict wordfreq output" + std::cout << "usage: generator agid wordnet cmudict wordfreq datadir output" << std::endl; std::cout << "agid :: path to an AGID infl.txt file" << std::endl; std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; + std::cout << "datadir :: path to the Lingo Randomizer datadir" << std::endl; std::cout << "output :: datafile output path" << std::endl; } int main(int argc, char** argv) { - if (argc == 6) { + if (argc == 7) { try { - generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); + generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); try { app.run(); -- cgit 1.4.1