Skip to content

Instantly share code, notes, and snippets.

@lord-alfred
Created November 28, 2017 13:00
Show Gist options
  • Save lord-alfred/05b16a79ad6f56018933558afb7877cb to your computer and use it in GitHub Desktop.
Save lord-alfred/05b16a79ad6f56018933558afb7877cb to your computer and use it in GitHub Desktop.
Simple keyword cleaner: removes stopwords and drops keywords that sound similar to ones already kept.
<?php
/*
* Author: Lord_Alfred
* Blog: https://vk.com/lord.alfred
*/
// Runtime configuration: report and display every error, raise the memory
// limit and remove the execution time limit for this run.
error_reporting(E_ALL);
ini_set('display_errors', true);
ini_set('memory_limit', '256M');
set_time_limit(0);
// Settings start
// Input and output file names; both are resolved relative to the directory
// that contains this script.
define('KEYWORDS_FILE_INPUT', 'keywords.txt');
define('KEYWORDS_FILE_OUTPUT', 'keywords_out.txt');
// Minimum length (in characters) used by the per-word filter.
define('ONE_WORD_LENGTH_MIN', 3);
// A cleaned keyword is kept only if it is strictly shorter than this.
define('KEYWORD_LENGTH_MAX', 40);
// Settings end
$input_path = __DIR__ . DIRECTORY_SEPARATOR . KEYWORDS_FILE_INPUT;
$data = is_readable($input_path) ? file_get_contents($input_path) : false;
if ($data === false) {
    // Stop here: carrying on would overwrite the output file with an empty
    // result and still report success.
    echo "Cannot read input file: " . $input_path;
    exit(1);
}
unset($input_path);
// Stopword list consulted by the word filter in the main loop: a word of a
// keyword that matches one of these entries is dropped from that keyword.
// Besides English function words the list holds contractions, single letters,
// two-letter codes and two numeric strings ("10" and "39").
// NOTE(review): a few entries end in a typographic quote character instead of
// being a complete word; they look like artifacts of the list this was
// assembled from - confirm before cleaning them up.
$stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't", "cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's", "its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves", "a's", "able", "according", "accordingly", "across", "actually", "afterwards", "ain't", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "another", "anybody", "anyhow", "anyone", "anything", "anyway", "anyways", "anywhere", "apart", "appear", "appreciate", "appropriate", "around", "aside", "ask", "asking", "associated", "available", "away", "awfully", "became", "become", "becomes", "becoming", "beforehand", "behind", "believe", "beside", 
"besides", "best", "better", "beyond", "brief", "c'mon", "c's", "came", "can", "cant", "cause", "causes", "certain", "certainly", "changes", "clearly", "co", "com", "come", "comes", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "course", "currently", "definitely", "described", "despite", "different", "done", "downwards", "edu", "eg", "eight", "either", "else", "elsewhere", "enough", "entirely", "especially", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "far", "fifth", "first", "five", "followed", "following", "follows", "former", "formerly", "forth", "four", "furthermore", "get", "gets", "getting", "given", "gives", "go", "goes", "going", "gone", "got", "gotten", "greetings", "happens", "hardly", "hello", "help", "hence", "hereafter", "hereby", "herein", "hereupon", "hi", "hither", "hopefully", "howbeit", "however", "ie", "ignored", "immediate", "inasmuch", "inc", "indeed", "indicate", "indicated", "indicates", "inner", "insofar", "instead", "inward", "it'd", "it'll", "just", "keep", "keeps", "kept", "know", "known", "knows", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "like", "liked", "likely", "little", "look", "looking", "looks", "ltd", "mainly", "many", "may", "maybe", "mean", "meanwhile", "merely", "might", "moreover", "mostly", "much", "must", "name", "namely", "nd", "near", "nearly", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "nobody", "non", "none", "noone", "normally", "nothing", "novel", "now", "nowhere", "obviously", "often", "oh", "ok", "okay", "old", "one", "ones", "onto", "others", "otherwise", "outside", "overall", "particular", "particularly", "per", "perhaps", "placed", "please", "plus", "possible", "presumably", "probably", "provides", "que", "quite", "qv", "rather", "rd", "re", "really", "reasonably", "regarding", 
"regardless", "regards", "relatively", "respectively", "right", "said", "saw", "say", "saying", "says", "second", "secondly", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "shall", "since", "six", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specified", "specify", "specifying", "still", "sub", "sup", "sure", "t's", "take", "taken", "tell", "tends", "th", "thank", "thanks", "thanx", "thats", "thence", "thereafter", "thereby", "therefore", "therein", "theres", "thereupon", "think", "third", "thorough", "thoroughly", "though", "three", "throughout", "thru", "thus", "together", "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "twice", "two", "un", "unfortunately", "unless", "unlikely", "unto", "upon", "us", "use", "used", "useful", "uses", "using", "usually", "value", "various", "via", "viz", "vs", "want", "wants", "way", "welcome", "well", "went", "whatever", "whence", "whenever", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "whither", "whoever", "whole", "whose", "will", "willing", "wish", "within", "without", "wonder", "yes", "yet", "zero", "www", "abst", "accordance", "act", "added", "adj", "affected", "affecting", "affects", "ah", "announce", "anymore", "apparently", "approximately", "aren", "arent", "arise", "auth", "b", "back", "begin", "beginning", "beginnings", "begins", "biol", "briefly", "c", "ca", "couldnt", "d", "date", "due", "e", "ed", "effect", "eighty", "end", "ending", "et-al", "f", "ff", "fix", "found", "g", "gave", "give", "giving", "h", "hed", "heres", "hes", "hid", "home", "hundred", "id", "im", "immediately", "importance", "important", "index", "information", "invention", "itd", "j", "k", "kg", "km", "l", "largely", "lets", "line", "'ll", "m", "made", "make", "makes", "means", "meantime", "mg", "million", "miss", "ml", 
"mr", "mrs", "mug", "n", "na", "nay", "necessarily", "ninety", "nonetheless", "nos", "noted", "o", "obtain", "obtained", "omitted", "ord", "owing", "p", "page", "pages", "part", "past", "poorly", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "promptly", "proud", "put", "q", "quickly", "r", "ran", "readily", "recent", "recently", "ref", "refs", "related", "research", "resulted", "resulting", "results", "run", "s", "sec", "section", "shed", "shes", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "slightly", "somethan", "specifically", "stop", "strongly", "substantially", "successfully", "sufficiently", "suggest", "t", "taking", "that'll", "that've", "thered", "there'll", "thereof", "therere", "thereto", "there've", "theyd", "theyre", "thou", "thoughh", "thousand", "throug", "til", "tip", "ts", "u", "unlike", "ups", "usefully", "usefulness", "v", "'ve", "vol", "vols", "w", "wasnt", "wed", "werent", "what'll", "whats", "wheres", "whim", "whod", "who'll", "whomever", "whos", "widely", "wont", "words", "world", "wouldnt", "x", "y", "youd", "youre", "z", "amoungst", "amount", "bill", "bottom", "call", "computer", "con", "cry", "de", "describe", "detail", "eleven", "empty", "fifteen", "fify", "fill", "find", "fire", "forty", "front", "full", "hasnt", "herse", "himse", "interest", "itse", "mill", "mine", "move", "myse", "side", "sincere", "sixty", "system", "ten", "thick", "thin", "top", "twelve", "twenty", "dear", "tis", "twas", "'tis", "'twas", "10", "39", "ableabout", "abroad", "ad", "adopted", "ae", "af", "ag", "ago", "ahead", "ai", "aint", "al", "alongside", "amid", "amidst", "ao", "aq", "ar", "area", "areas", "arpa", "asked", "asks", "au", "aw", "az", "ba", "backed", "backing", "backs", "backward", "backwards", "bb", "bd", "began", "beings", "bf", "bg", "bh", "bi", "big", "billion", "bj", "bm", "bn", "bo", "br", "bs", "bt", "buy", "bv", "bw", "bz", "caption", "case", 
"cases", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "clear", "click", "cm", "cmon", "cn", "co.", "copy", "could've", "couldn", "cr", "cs", "cu", "cv", "cx", "cy", "cz", "dare", "daren't", "darent", "didn", "didnt", "differ", "differently", "directly", "dj", "dk", "dm", "doesn", "doesnt", "don", "dont", "doubtful", "downed", "downing", "downs", "dz", "early", "ec", "ee", "eh", "ended", "ends", "er", "es", "evenly", "evermore", "face", "faces", "fact", "facts", "fairly", "farther", "felt", "fewer", "fi", "fifty", "finds", "fj", "fk", "fm", "fo", "forever", "forward", "fr", "free", "fully", "furthered", "furthering", "furthers", "fx", "ga", "gb", "gd", "ge", "general", "generally", "gf", "gg", "gh", "gi", "gl", "gm", "gmt", "gn", "good", "goods", "gov", "gp", "gq", "gr", "great", "greater", "greatest", "group", "grouped", "grouping", "groups", "gs", "gt", "gu", "gw", "gy", "hadnt", "half", "hasn", "haven", "havent", "hell", "herse”", "high", "higher", "highest", "himse”", "hk", "hm", "hn", "homepage", "how'd", "how'll", "hr", "ht", "htm", "html", "http", "hu", "i.e.", "ii", "il", "ill", "inc.", "inside", "int", "interested", "interesting", "interests", "io", "iq", "ir", "isn", "isnt", "itll", "itse”", "ive", "je", "jm", "jo", "join", "jp", "ke", "keys", "kh", "ki", "kind", "kn", "knew", "kp", "kr", "kw", "ky", "kz", "la", "large", "latest", "lb", "lc", "length", "li", "likewise", "lk", "ll", "long", "longer", "longest", "low", "lower", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "making", "man", "mayn't", "maynt", "mc", "md", "member", "members", "men", "mh", "microsoft", "might've", "mightn't", "mightnt", "mil", "minus", "mk", "mm", "mn", "mo", "mp", "mq", "ms", "msie", "mt", "mu", "must've", "mustnt", "mv", "mw", "mx", "myse”", "mz", "nc", "ne", "needed", "needing", "needn't", "neednt", "net", "netscape", "neverf", "neverless", "newer", "newest", "nf", "ng", "ni", "nl", "no-one", "notwithstanding", "np", "nr", "nu", "null", "number", "numbers", "nz", "older", 
"oldest", "om", "one's", "open", "opened", "opening", "opens", "opposite", "order", "ordered", "ordering", "orders", "org", "oughtn't", "oughtnt", "pa", "parted", "parting", "parts", "pe", "pf", "pg", "ph", "pk", "pl", "place", "places", "pm", "pmid", "pn", "point", "pointed", "pointing", "points", "pr", "presented", "presenting", "presents", "problem", "problems", "provided", "pt", "puts", "pw", "py", "qa", "reserved", "ring", "ro", "room", "rooms", "round", "ru", "rw", "sa", "sb", "sc", "sd", "se", "seconds", "sees", "seventy", "sg", "sh", "shant", "shell", "should've", "shouldn", "shouldnt", "showing", "si", "sides", "site", "sj", "sk", "sl", "sm", "small", "smaller", "smallest", "sn", "someday", "sr", "st", "state", "states", "su", "sv", "sy", "sz", "tc", "td", "test", "text", "tf", "tg", "thatll", "thatve", "there'd", "there're", "therell", "thereve", "theyll", "theyve", "thing", "things", "thinks", "thirty", "thought", "thoughts", "till", "tj", "tk", "tm", "tn", "today", "tp", "tr", "trillion", "tt", "turn", "turned", "turning", "turns", "tv", "tw", "tz", "ua", "ug", "uk", "um", "underneath", "undoing", "upwards", "uucp", "uy", "uz", "va", "vc", "ve", "versus", "vg", "vi", "vn", "vu", "wanted", "wanting", "wasn", "ways", "web", "webpage", "website", "wells", "weren", "weve", "wf", "what'd", "what've", "whatll", "whatve", "when'd", "when'll", "where'd", "where'll", "whichever", "whilst", "who'd", "wholl", "why'd", "why'll", "width", "won", "work", "worked", "working", "works", "would've", "wouldn", "ws", "ye", "year", "years", "youll", "young", "younger", "youngest", "youve", "yt", "yu", "za", "zm", "zr"];
// prepare
// Collapse every run of spaces, tabs and UTF-8 non-breaking spaces into one
// plain space. The pattern works on bytes (no /u flag), so input that is not
// valid UTF-8 cannot make preg_replace() fail for encoding reasons.
$normalized = preg_replace('/(?:\xC2\xA0|[ \t])+/', ' ', $data);
if ($normalized !== null) {
    $data = $normalized;
}
// Normalise Windows (\r\n) and lone \r line endings to \n; the array is
// processed in order, so \r\n is handled before the bare \r.
$data = str_replace(["\r\n", "\r"], "\n", $data);
// Drop the space left at the end of a line.
$data = str_replace(" \n", "\n", $data);
// Strip domain decoration: "www." anywhere ...
$data = str_replace("www.", "", $data);
// ... and ".com" where it ends a word. The lookahead also matches at the end
// of a line and at the end of the input, which a plain search for ".com "
// misses once trailing spaces have been removed.
$normalized = preg_replace('/\.com(?=[ \n]|$)/', '', $data);
if ($normalized !== null) {
    $data = $normalized;
}
unset($normalized);
$data = trim($data);
// to array
// One keyword phrase per line.
$keywords_array = explode("\n", $data);
unset($data);
// result & hash arrays
// $result_array collects the cleaned keywords in input order.
$result_array = array();
// $hash_array is a set of phonetic signatures already emitted; the signatures
// are stored as array keys so that the duplicate check is a key lookup
// instead of a scan over every earlier keyword.
$hash_array = array();
// Stopwords as array keys: constant-time, exact membership test. A loose
// in_array() would treat numeric strings such as "10.0" or "010" as equal to
// the stopword "10".
$stopword_set = array_flip($stopwords);
// go checking all keywords
foreach ($keywords_array as $kw) {
    $kw = trim(mb_strtolower($kw));
    $kw_new = array();
    $kw_hash = array();
    // check words in keyword
    foreach (explode(" ", $kw) as $w) {
        if (isset($stopword_set[$w])) {
            continue;
        }
        // Words must reach the minimum length, except digit-only words, which
        // are kept regardless. explode() yields strings, so is_int() can never
        // be true here; ctype_digit() performs the intended check.
        if (mb_strlen($w) < ONE_WORD_LENGTH_MIN && !ctype_digit($w)) {
            continue;
        }
        $kw_new[] = $w;
        // soundex() only considers ASCII letters, so every word without any
        // (numbers, non-Latin text) would receive the same code and unrelated
        // keywords would be dropped as duplicates. Such words stand for
        // themselves in the signature.
        $kw_hash[] = preg_match('/[a-z]/i', $w) === 1 ? soundex($w) : $w;
    }
    if (count($kw_new) === 0) {
        continue;
    }
    $kw_new_str = implode(" ", $kw_new);
    if (mb_strlen($kw_new_str) >= KEYWORD_LENGTH_MAX) {
        continue;
    }
    // Signature = the distinct word codes in a canonical order, so the same
    // words in a different order (or repeated) give the same signature.
    // SORT_STRING keeps the order well defined when numeric tokens are present.
    $kw_hash = array_unique($kw_hash);
    sort($kw_hash, SORT_STRING);
    // Words never contain a space, so joining with one cannot make two
    // different token lists produce the same signature.
    $kw_hash_str = implode(' ', $kw_hash);
    if (!isset($hash_array[$kw_hash_str])) {
        $hash_array[$kw_hash_str] = true;
        $result_array[] = $kw_new_str;
    }
}
// GC
unset($stopword_set, $kw, $w, $kw_new, $kw_hash, $kw_new_str, $kw_hash_str);
// GC
unset($keywords_array);
unset($hash_array);
// Order the cleaned keywords by character length, shortest first.
usort($result_array, function ($a, $b) {
    return mb_strlen($a) - mb_strlen($b);
});
// write cleaned keywords
// One keyword per line, no trailing newline; the file is written next to
// this script.
$data = implode("\n", $result_array);
$output_path = __DIR__ . DIRECTORY_SEPARATOR . KEYWORDS_FILE_OUTPUT;
if (file_put_contents($output_path, $data) === false) {
    // Report the failure instead of claiming success.
    echo "Cannot write output file: " . $output_path;
    exit(1);
}
echo "Done";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment