Skip to content

Instantly share code, notes, and snippets.

@rijvirajib
Created May 11, 2016 20:12
Show Gist options
  • Save rijvirajib/26720565e5ab947c4a5a0f5ab6371ab0 to your computer and use it in GitHub Desktop.
Save rijvirajib/26720565e5ab947c4a5a0f5ab6371ab0 to your computer and use it in GitHub Desktop.
Not-Optimized Keyword Density Script in Perl
use strict;
use warnings;
use Text::Trim qw(trim);
use List::MoreUtils qw(any);
use HTML::Entities qw(encode_entities);
use HTML::StripTags qw(strip_tags);
our @STOPWORDS = (
'a','about','above','above','across','after','afterwards','again','against','all',
'almost','alone','along','already','also','although','always','am','among','amongst','amoungst',
'amount', 'an','and','another','any','anyhow','anyone','anything','anyway','anywhere','are',
'around','as', 'at','back','be','became','because','become','becomes','becoming','been',
'before','beforehand','behind','being','below','beside','besides','between','beyond','bill',
'both','bottom','but','by','call','can','cannot','cant','co','con','could','couldnt','cry',
'de','describe','detail','do','done','down','due','during','each','eg','eight','either',
'eleven','else','elsewhere','empty','enough','etc','even','ever','every','everyone','everything',
'everywhere','except','few','fifteen','fify','fill','find','fire','first','five','for','former',
'formerly','forty','found','four','from','front','full','further','get','give','go','had',
'has','hasnt','have','he','hence','her','here','hereafter','hereby','herein','hereupon','hers',
'herself','him','himself','his','how','however','hundred','ie','if','in','inc','indeed','interest',
'into','is','it','its','itself','keep','last','latter','latterly','least','less','ltd','made',
'many','may','me','meanwhile','might','mill','mine','more','moreover','most','mostly','move',
'much','must','my','myself','name','namely','neither','never','nevertheless','next','nine','no',
'nobody','none','noone','nor','not','nothing','now','nowhere','of','off','often','on','once',
'one','only','onto','or','other','others','otherwise','our','ours','ourselves','out','over',
'own','part','per','perhaps','please','put','rather','re','same','see','seem','seemed','seeming',
'seems','serious','several','she','should','show','side','since','sincere','six','sixty','so',
'some','somehow','someone','something','sometime','sometimes','somewhere','still','such','system',
'take','ten','than','that','the','their','them','themselves','then','thence','there','thereafter',
'thereby','therefore','therein','thereupon','these','they','thickv','thin','third','this','those',
'though','three','through','throughout','thru','thus','to','together','too','top','toward',
'towards','twelve','twenty','two','un','under','until','up','upon','us','very','via','was','we',
'well','were','what','whatever','when','whence','whenever','where','whereafter','whereas','whereby',
'wherein','whereupon','wherever','whether','which','while','whither','who','whoever','whole','whom',
'whose','why','will','with','within','without','would','yet','you','your','yours','yourself',
'yourselves','the', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ''
);
sub keyword_density {
my $self = shift;
my $text = shift;
my @prevWords = ();
my %wordCount = ();
my %buckets = ();
my $cleanText = clean_text($text);
my @words = split(/\s/, $cleanText);
for my $word (@words) {
$word = lc(trim($word));
## Additional check not needed if cleanText worked right
if(any { $_ eq $word } @STOPWORDS) {
next;
}
## Ignore words with 2 length, as, by, if, or, ANY PUNCTUATION
if(length($word) <= 2) {
next;
}
if(!$wordCount{$word}) {
$wordCount{$word} = 0;
}
$wordCount{$word}++;
push(@prevWords, $word);
my $phrase = trim(join(' ', @prevWords));
if(!$wordCount{$phrase}) {
$wordCount{$phrase} = 0;
}
$wordCount{$phrase}++;
if(scalar(@prevWords) > 2) {
shift(@prevWords);
}
## One more time for 3 words
## TODO: Optimize code
$phrase = trim(join(' ', @prevWords));
if(!$wordCount{$phrase}) {
$wordCount{$phrase} = 0;
}
$wordCount{$phrase}++;
}
## Build out Keyword Density Object
for my $key (keys %wordCount) {
my $count = $wordCount{$key};
my @words = split(/\s/, $key);
my $length = scalar(@words);
if(!$buckets{$length}) {
$buckets{$length} = ();
}
$buckets{$length}{$key} = $count;
}
return \%buckets;
}
sub clean_text {
my $text = shift;
$text = strip_tags($text);
$text = encode_entities($text);
# Remove Entities
$text =~ s/&nbsp;//gi;
$text =~ s/&amp;//gi;
$text =~ s/&quot;//gi;
$text =~ s/&apos;//gi;
$text =~ s/&#39;//gi;
$text =~ s/&lt;//gi;
$text =~ s/&gt;//gi;
$text =~ s/&(copy|reg);//gi;
# Remove Signs
$text =~ s/[']s //g;
$text =~ s/['"?!]//g;
$text =~ s/[,.]//g;
$text =~ s/.?\}\).*\;//g;
$text =~ s/.?\)\)\;//g;
## Remove STOPWORDS
for my $stopword (@STOPWORDS) {
$text =~ s/\b$stopword\b//gi;
}
return $text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment