Last active
September 29, 2020 08:08
-
-
Save iamsarthakjoshi/09c676c343f2b0ab612b0da242fef515 to your computer and use it in GitHub Desktop.
Count Keywords and Display Frequency Bar Chart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
You need to install poppler for pdftotext (for macOS) | |
> brew install pkg-config poppler | |
@JOSHI, Sarthak | |
PyPi: https://pypi.org/project/pdftotext/ | |
""" | |
import os | |
import re | |
import traceback | |
import pdftotext | |
import collections | |
import matplotlib.pyplot as plt | |
from copy import copy | |
def _collect_pdf_text(directory, read_max_pages):
    """Extract text from every ``.pdf`` file in *directory*.

    Reads at most *read_max_pages* pages per file (a falsy value means
    "read every page") and returns ``(concatenated_text, pdf_file_count)``.
    """
    pages = []
    count_files = 0
    for filename in os.listdir(directory):
        if not filename.endswith(".pdf"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "rb") as f:
            pdf = pdftotext.PDF(f, raw=True)
            total_pages = len(pdf)
            # BUG FIX: the original compared against a hard-coded 10 and
            # could set total_pages past the real page count; cap at the
            # document length instead.
            if read_max_pages:
                total_pages = min(total_pages, read_max_pages)
                print(f"Reading max pages updated to: {total_pages}\n")
            count_files += 1
            print(f"{count_files} - {filename} - Completed")
            pages.extend(pdf[i] for i in range(total_pages))
    # Join once at the end instead of quadratic `+=` string concatenation.
    return "".join(pages), count_files


def _plot_frequencies(occurrences, keywords, squeeze_data):
    """Show a horizontal bar chart of relative keyword frequencies.

    *occurrences* is a Counter of lower-cased matched keywords; bars are
    scaled to percent of the largest count. When *squeeze_data* is true,
    counts at or above the maximum are pulled down slightly so smaller
    bars stay visible.
    """
    fig, ax = plt.subplots()
    y_list = list(occurrences.keys())
    max_value = max(occurrences.values())
    if squeeze_data:
        if 500 <= max_value <= 1000:
            max_value = 1000
            deduct_by = 300
        else:
            deduct_by = (0.3 / 10) * max_value
        # Squeeze the data so the tallest bar does not dwarf the rest.
        x_list = [
            (int(item) / max_value) * 100
            if int(item) < max_value
            else ((int(item) - deduct_by) / max_value) * 100
            for item in occurrences.values()
        ]
    else:
        x_list = [(int(item) / max_value) * 100 for item in occurrences.values()]
    # Report any requested keyword that never occurred in the corpus.
    if len(keywords) > len(y_list):
        zero_occ_keywords = [
            item for item in keywords if item.lower() not in y_list
        ]
        print(f"These keywords have 0 occurrences: {', '.join(zero_occ_keywords)}")
    labels = [key.title() for key in y_list]
    ax.barh(y_list, x_list, align="center", color="#70AD47", zorder=3)
    ax.grid(zorder=0)
    ax.set_yticks(y_list)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_ylabel("Keyword")
    ax.set_title("Keyword Frequency Table")
    plt.show()


def get_word_frequency(
    keywords, directory, show_graph=False, read_max_pages=10, squeeze_data=True
):
    """Count occurrences of *keywords* across all PDFs in *directory*.

    Parameters
    ----------
    keywords : list[str]
        Terms to search for; matched case-insensitively on word boundaries.
    directory : str
        Path of a directory containing ``.pdf`` files.
    show_graph : bool
        When True, display a bar chart instead of returning the counts.
    read_max_pages : int | bool
        Per-file page cap; a falsy value reads every page.
    squeeze_data : bool
        Compress outsized counts in the chart (see ``_plot_frequencies``).

    Returns
    -------
    collections.Counter | str | None
        The counts when ``show_graph`` is False; an explanatory message
        string on bad arguments or when nothing matched; ``None`` after
        the chart has been shown or when an unexpected error is logged.
    """
    if not keywords:
        return "No Keywords Given!"
    if not directory:
        return "No Directory Path Given!"
    print(f"Reading max pages of: {read_max_pages}")
    print(f"Show Graph: {show_graph}")
    try:
        # BUG FIX: escape each keyword so regex metacharacters ('-', '.',
        # '(' ...) are matched literally instead of being interpreted.
        _keywords = [rf"\b{re.escape(keyword)}\b" for keyword in keywords]
        print("KEYS", _keywords)
        re_pattern = rf"""\W*({"|".join(_keywords)})\W*"""
        print(f"Regex pattern: {re_pattern}")
        overall_text, count_files = _collect_pdf_text(directory, read_max_pages)
        # BUG FIX: the original used for/else, which printed this message
        # on every run (for-else fires whenever the loop ends without
        # `break`); now it only fires when no PDF was actually found.
        if count_files == 0:
            print("No PDF files found")
        matched_text = re.findall(
            pattern=re_pattern,
            string=overall_text.lower(),
            flags=re.MULTILINE | re.IGNORECASE,
        )
        occurrences = collections.Counter(matched_text)
        if not occurrences:
            return "No occurrences of keywords found."
        print(f"Occurrences: {occurrences}")
        if show_graph:
            _plot_frequencies(occurrences, keywords, squeeze_data)
            return None
        return occurrences
    except Exception:
        # Best-effort: log the full traceback rather than crash the caller.
        print(f"This is an error Message at: \n\n {traceback.format_exc()}")
# Directory that holds the PDF files to scan.
directory = "/Users/sarthakjoshi/Data/Assignment/Emerging/202060/journals-pdfs"
# directory = "./pdfs"

# Keywords to count — swap these out for your own search terms.
keywords = [
    "ARIMA",
    "CNN",
    "GCN",
    "LSTM",
    "pems",
    "accuracy",
    "computational",
    "deep learning",
    "datasets",
    "geographical similarity",
    "Intelligent transportation system",
    "Loop Detectors",
    "neural network",
    "Spatio-temporal",
    "Sensors",
    "Traffic data",
    "Training data",
    "traffic speed prediction",
    "traffic flow prediction",
]

# Run the count and pop up the frequency chart.
get_word_frequency(
    keywords=keywords,
    directory=directory,
    show_graph=True,
    read_max_pages=False,
    squeeze_data=True,
)
""" | |
1) Make sure you have installed Python 3.x
2) Make a virtual env with the command below
> python3 -m venv venv | |
3) Activate the virtual env with command below | |
> source venv/bin/activate | |
4) Copy below libs to requirements.txt | |
argcomplete==1.10.0 | |
beautifulsoup4==4.8.0 | |
certifi==2020.6.20 | |
chardet==3.0.4 | |
cycler==0.10.0 | |
docx2txt==0.8 | |
EbookLib==0.17.1 | |
extract-msg==0.23.1 | |
IMAPClient==2.1.0 | |
Js2Py==0.70 | |
kiwisolver==1.2.0 | |
lxml==4.5.2 | |
matplotlib==3.3.2 | |
numpy==1.19.2 | |
olefile==0.46 | |
pandas==1.1.2 | |
pdfminer==20191125 | |
pdfminer.six==20181108 | |
pdftotext==2.1.5 | |
Pillow==7.2.0 | |
pycryptodome==3.9.8 | |
pyjsparser==2.7.1 | |
pyparsing==2.4.7 | |
PyPDF2==1.26.0 | |
python-dateutil==2.8.1 | |
python-pptx==0.6.18 | |
pytz==2020.1 | |
six==1.12.0 | |
sortedcontainers==2.2.2 | |
soupsieve==2.0.1 | |
SpeechRecognition==3.8.1 | |
textract==1.6.3 | |
tzlocal==1.5.1 | |
xlrd==1.2.0 | |
XlsxWriter==1.3.6 | |
5) Install all the dependencies with command below | |
> pip install -r requirements.txt | |
6) Run below command to get the frequency chart | |
> python count_keywords_frequency.py | |
Parameters: | |
1) keywords: list | |
You can put your keywords in 'keywords' list | |
2) directory: str | |
    You can put your directory where your pdf files are stored
3) show_graph: bool | |
    You can disable showing the graph with False and enable it with True
4) read_max_pages: int | |
You can set maximum pages to be read, default is 10 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment