Created
May 11, 2016 20:12
-
-
Save rijvirajib/26720565e5ab947c4a5a0f5ab6371ab0 to your computer and use it in GitHub Desktop.
Not-Optimized Keyword Density Script in Perl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use strict; | |
use warnings; | |
use Text::Trim qw(trim); | |
use List::MoreUtils qw(any); | |
use HTML::Entities qw(encode_entities); | |
use HTML::StripTags qw(strip_tags); | |
our @STOPWORDS = ( | |
'a','about','above','above','across','after','afterwards','again','against','all', | |
'almost','alone','along','already','also','although','always','am','among','amongst','amoungst', | |
'amount', 'an','and','another','any','anyhow','anyone','anything','anyway','anywhere','are', | |
'around','as', 'at','back','be','became','because','become','becomes','becoming','been', | |
'before','beforehand','behind','being','below','beside','besides','between','beyond','bill', | |
'both','bottom','but','by','call','can','cannot','cant','co','con','could','couldnt','cry', | |
'de','describe','detail','do','done','down','due','during','each','eg','eight','either', | |
'eleven','else','elsewhere','empty','enough','etc','even','ever','every','everyone','everything', | |
'everywhere','except','few','fifteen','fify','fill','find','fire','first','five','for','former', | |
'formerly','forty','found','four','from','front','full','further','get','give','go','had', | |
'has','hasnt','have','he','hence','her','here','hereafter','hereby','herein','hereupon','hers', | |
'herself','him','himself','his','how','however','hundred','ie','if','in','inc','indeed','interest', | |
'into','is','it','its','itself','keep','last','latter','latterly','least','less','ltd','made', | |
'many','may','me','meanwhile','might','mill','mine','more','moreover','most','mostly','move', | |
'much','must','my','myself','name','namely','neither','never','nevertheless','next','nine','no', | |
'nobody','none','noone','nor','not','nothing','now','nowhere','of','off','often','on','once', | |
'one','only','onto','or','other','others','otherwise','our','ours','ourselves','out','over', | |
'own','part','per','perhaps','please','put','rather','re','same','see','seem','seemed','seeming', | |
'seems','serious','several','she','should','show','side','since','sincere','six','sixty','so', | |
'some','somehow','someone','something','sometime','sometimes','somewhere','still','such','system', | |
'take','ten','than','that','the','their','them','themselves','then','thence','there','thereafter', | |
'thereby','therefore','therein','thereupon','these','they','thickv','thin','third','this','those', | |
'though','three','through','throughout','thru','thus','to','together','too','top','toward', | |
'towards','twelve','twenty','two','un','under','until','up','upon','us','very','via','was','we', | |
'well','were','what','whatever','when','whence','whenever','where','whereafter','whereas','whereby', | |
'wherein','whereupon','wherever','whether','which','while','whither','who','whoever','whole','whom', | |
'whose','why','will','with','within','without','would','yet','you','your','yours','yourself', | |
'yourselves','the', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '' | |
); | |
sub keyword_density { | |
my $self = shift; | |
my $text = shift; | |
my @prevWords = (); | |
my %wordCount = (); | |
my %buckets = (); | |
my $cleanText = clean_text($text); | |
my @words = split(/\s/, $cleanText); | |
for my $word (@words) { | |
$word = lc(trim($word)); | |
## Additional check not needed if cleanText worked right | |
if(any { $_ eq $word } @STOPWORDS) { | |
next; | |
} | |
## Ignore words with 2 length, as, by, if, or, ANY PUNCTUATION | |
if(length($word) <= 2) { | |
next; | |
} | |
if(!$wordCount{$word}) { | |
$wordCount{$word} = 0; | |
} | |
$wordCount{$word}++; | |
push(@prevWords, $word); | |
my $phrase = trim(join(' ', @prevWords)); | |
if(!$wordCount{$phrase}) { | |
$wordCount{$phrase} = 0; | |
} | |
$wordCount{$phrase}++; | |
if(scalar(@prevWords) > 2) { | |
shift(@prevWords); | |
} | |
## One more time for 3 words | |
## TODO: Optimize code | |
$phrase = trim(join(' ', @prevWords)); | |
if(!$wordCount{$phrase}) { | |
$wordCount{$phrase} = 0; | |
} | |
$wordCount{$phrase}++; | |
} | |
## Build out Keyword Density Object | |
for my $key (keys %wordCount) { | |
my $count = $wordCount{$key}; | |
my @words = split(/\s/, $key); | |
my $length = scalar(@words); | |
if(!$buckets{$length}) { | |
$buckets{$length} = (); | |
} | |
$buckets{$length}{$key} = $count; | |
} | |
return \%buckets; | |
} | |
sub clean_text { | |
my $text = shift; | |
$text = strip_tags($text); | |
$text = encode_entities($text); | |
# Remove Entities | |
$text =~ s/ //gi; | |
$text =~ s/&//gi; | |
$text =~ s/"//gi; | |
$text =~ s/'//gi; | |
$text =~ s/'//gi; | |
$text =~ s/<//gi; | |
$text =~ s/>//gi; | |
$text =~ s/&(copy|reg);//gi; | |
# Remove Signs | |
$text =~ s/[']s //g; | |
$text =~ s/['"?!]//g; | |
$text =~ s/[,.]//g; | |
$text =~ s/.?\}\).*\;//g; | |
$text =~ s/.?\)\)\;//g; | |
## Remove STOPWORDS | |
for my $stopword (@STOPWORDS) { | |
$text =~ s/\b$stopword\b//gi; | |
} | |
return $text; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment