Last active
September 29, 2020 08:57
-
-
Save iamsarthakjoshi/db75ed8e2114f6cf18cf396a97445ede to your computer and use it in GitHub Desktop.
Count Keywords Frequency and Generate Chart with 'pdftotext' and 'matplotlib'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
You need to install poppler for pdftotext (for macOS) | |
> brew install pkg-config poppler | |
@JOSHI, Sarthak | |
PyPi: https://pypi.org/project/pdftotext/ | |
""" | |
import os | |
import re | |
import traceback | |
import pdftotext | |
import collections | |
import matplotlib.pyplot as plt | |
from copy import copy | |
def get_word_frequency(
    keywords, directory, show_graph=False, read_max_pages=10, squeeze_data=True, bar_color="#70AD47"
):
    """Count keyword occurrences across the ``.pdf`` files in *directory*.

    Parameters
    ----------
    keywords : list[str]
        Terms to count; matched case-insensitively on word boundaries.
    directory : str
        Path of a directory containing ``.pdf`` files.
    show_graph : bool
        When True, render a horizontal bar chart instead of returning counts.
    read_max_pages : int | bool
        Upper bound on pages read per PDF; a falsy value reads every page.
    squeeze_data : bool
        When True, compress bars that reach the maximum so one dominant
        keyword does not flatten the rest of the chart.
    bar_color : str
        Bar fill colour for the chart.

    Returns
    -------
    str | collections.Counter | None
        An error-message string on bad input or no matches, the Counter of
        matches when ``show_graph`` is False, or None after showing the
        chart (or after a caught exception).
    """
    if not keywords:
        return "No Keywords Given!"
    if not directory:
        return "No Directory Path Given!"

    list_of_files_dirs = os.listdir(directory)
    if len(list_of_files_dirs) < 1:
        print("====> No PDF files found <====")
        return "No PDF files found"

    print(f"Reading max pages of: {read_max_pages}")
    print(f"Show Graph: {show_graph}")
    print(f"Total PDFS Found: {len(list_of_files_dirs)}\n")
    try:
        # BUG FIX: re.escape keeps regex metacharacters inside a keyword
        # (e.g. "(", "+") from being interpreted as pattern syntax; \b
        # anchors each keyword on word boundaries, as before.
        _keywords = [rf"\b{re.escape(keyword)}\b" for keyword in keywords]
        print("KEYS", _keywords)
        re_pattern = rf"""\W*({"|".join(_keywords)})\W*"""
        print(f"Regex pattern: {re_pattern}\n")
        print("Counting started, please wait..\n")

        pdf_files = [name for name in list_of_files_dirs if name.endswith(".pdf")]
        if not pdf_files:
            # The directory has entries, but none of them are PDFs.
            print("No PDF files found")

        page_texts = []
        count_files = 0
        for filename in pdf_files:
            filepath = os.path.join(directory, filename)
            with open(filepath, "rb") as f:
                pdf = pdftotext.PDF(f, raw=True)
                # BUG FIX: clamp to the real page count.  The original
                # overwrote total_pages with read_max_pages whenever a PDF
                # had >= 10 pages, which raised IndexError for documents
                # shorter than read_max_pages.
                total_pages = len(pdf)
                if read_max_pages:
                    total_pages = min(total_pages, read_max_pages)
                count_files += 1
                # BUG FIX: report the actual file name (was a placeholder).
                print(f"{count_files} - {filename} - Completed")
                for page_index in range(total_pages):
                    page_texts.append(pdf[page_index])
        # Join once instead of growing a string per page (avoids quadratic copies).
        overall_text = "".join(page_texts)

        matched_text = re.findall(
            pattern=re_pattern,
            string=overall_text.lower(),
            flags=re.MULTILINE | re.IGNORECASE,
        )
        occurrences = collections.Counter(matched_text)
        if not occurrences:
            return "No occurences of keywords found."
        print("Counting completed. Enjoy!\n")
        print(f"Keywords Ocurrences: \n{occurrences}\n")

        if not show_graph:
            return occurrences

        fig, ax = plt.subplots()
        y_list = list(occurrences.keys())
        x_list = list(occurrences.values())
        # Largest single count; most_common(1) yields [(key, count)].
        max_value = occurrences.most_common(1)[0][1]

        if squeeze_data:
            if 500 <= max_value <= 1000:
                max_value = 1000
                deduct_by = 300
            else:
                deduct_by = (0.3 / 10) * max_value
            # Shrink bars that hit the maximum so the rest stay readable.
            x_list = [
                (int(item) / max_value) * 100
                if int(item) < max_value
                else ((int(item) - deduct_by) / max_value) * 100
                for item in occurrences.values()
            ]
        else:
            x_list = [(int(item) / max_value) * 100 for item in occurrences.values()]

        # Report any requested keyword that never matched at all.
        if len(keywords) > len(y_list):
            zero_occ_keywords = [
                item for item in keywords if item.lower() not in y_list
            ]
            print(f"These keyword has 0 occurance: {', '.join(zero_occ_keywords)}")

        # Title-case the matched keywords for the y-axis labels.
        labels = [keyword.title() for keyword in y_list]
        ax.barh(y_list, x_list, align="center", color=bar_color, zorder=3)
        ax.grid(zorder=0)
        ax.set_yticks(y_list)
        ax.set_yticklabels(labels)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_ylabel("Keyword")
        ax.set_title("Keyword Frequency Table")
        plt.show()
    except Exception:
        # CLI boundary: report the traceback instead of crashing; returns
        # None, matching the original behaviour.
        print(f"This is an error Message at: \n\n {traceback.format_exc()}")
# Directory where your PDF files are kept.
# directory = "/Users/sarthakjoshi/Data/Assignment/Emerging/202060/journals-pdfs"
directory = "./pdfs"

# Set your keywords here; replace these terms with your own.
keywords = [
    "ARIMA",
    "CNN",
    "GCN",
    "LSTM",
    "pems",
    "accuracy",
    "computational",
    "deep learning",
    "datasets",
    "geographical similarity",
    "Intelligent transportation system",
    "Loop Detectors",
    "neural network",
    "Spatio-temporal",
    "Sensors",
    "Traffic data",
    "Training data",
    "traffic speed prediction",
    "traffic flow prediction",
]

# Guard the invocation so importing this module does not trigger the scan.
if __name__ == "__main__":
    get_word_frequency(
        keywords=keywords,
        directory=directory,
        show_graph=True,
        read_max_pages=False,
        squeeze_data=True,
        bar_color="#187bcd",
    )
""" | |
1) Make sure you have install Python 3.x | |
2) Make a virtual env with the command below
> python3 -m venv venv | |
3) Activate the virtual env with command below | |
> source venv/bin/activate | |
4) Copy below libs to requirements.txt | |
argcomplete==1.10.0 | |
beautifulsoup4==4.8.0 | |
certifi==2020.6.20 | |
chardet==3.0.4 | |
cycler==0.10.0 | |
docx2txt==0.8 | |
EbookLib==0.17.1 | |
extract-msg==0.23.1 | |
IMAPClient==2.1.0 | |
Js2Py==0.70 | |
kiwisolver==1.2.0 | |
lxml==4.5.2 | |
matplotlib==3.3.2 | |
numpy==1.19.2 | |
olefile==0.46 | |
pandas==1.1.2 | |
pdfminer==20191125 | |
pdfminer.six==20181108 | |
pdftotext==2.1.5 | |
Pillow==7.2.0 | |
pycryptodome==3.9.8 | |
pyjsparser==2.7.1 | |
pyparsing==2.4.7 | |
PyPDF2==1.26.0 | |
python-dateutil==2.8.1 | |
python-pptx==0.6.18 | |
pytz==2020.1 | |
six==1.12.0 | |
sortedcontainers==2.2.2 | |
soupsieve==2.0.1 | |
SpeechRecognition==3.8.1 | |
textract==1.6.3 | |
tzlocal==1.5.1 | |
xlrd==1.2.0 | |
XlsxWriter==1.3.6 | |
5) Install all the dependencies with command below | |
> pip install -r requirements.txt | |
6) Run below command to get the frequency chart | |
> python count_keywords_frequency.py | |
Parameters: | |
1) keywords: list | |
You can put your keywords in 'keywords' list | |
2) directory: str | |
You can put the directory where your pdf files are stored
3) show_graph: bool | |
You can disable showing the graph with False and enable it with True
4) read_max_pages: int | |
You can set maximum pages to be read, default is 10 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment