Count Keyword Frequency and Generate a Chart with 'pdftotext' and 'matplotlib'
"""
You need to install poppler for pdftotext (for macOS)
> brew install pkg-config poppler
@JOSHI, Sarthak
PyPi: https://pypi.org/project/pdftotext/
"""
import collections
import os
import re
import traceback

import matplotlib.pyplot as plt
import pdftotext
def get_word_frequency(
    keywords, directory, show_graph=False, read_max_pages=10, squeeze_data=True, bar_color="#70AD47"
):
    if not keywords:
        return "No Keywords Given!"
    if not directory:
        return "No Directory Path Given!"
    list_of_files_dirs = os.listdir(directory)
    if len(list_of_files_dirs) < 1:
        print("====> No PDF files found <====")
        return "No PDF files found"
    print(f"Reading max pages of: {read_max_pages}")
    print(f"Show Graph: {show_graph}")
    print(f"Total files found: {len(list_of_files_dirs)}\n")
    try:
        # wrap each keyword in word boundaries so "data" does not match "datasets"
        _keywords = [rf"\b{keyword}\b" for keyword in keywords]
        re_pattern = rf"""\W*({"|".join(_keywords)})\W*"""
        overall_text = ""
        count_files = 0
        print(f"Regex pattern: {re_pattern}\n")
        print("Counting started, please wait...\n")
        for filename in list_of_files_dirs:
            if not filename.endswith(".pdf"):
                print(f"Skipping non-PDF file: {filename}")
                continue
            filepath = os.path.join(directory, filename)
            with open(filepath, "rb") as f:
                pdf_file_obj = pdftotext.PDF(f, raw=True)
                total_pages = len(pdf_file_obj)
                # cap the number of pages read per PDF if a limit was given
                if read_max_pages and total_pages > read_max_pages:
                    total_pages = read_max_pages
                    print(f"Reading max pages updated to: {total_pages}\n")
                count_files += 1
                for page_number in range(total_pages):
                    overall_text += pdf_file_obj[page_number]
            print(f"{count_files} - {filename} - Completed")
        matched_text = re.findall(
            pattern=re_pattern,
            string=overall_text.lower(),
            flags=re.MULTILINE | re.IGNORECASE,
        )
        occurrences = collections.Counter(matched_text)
        if not occurrences:
            return "No occurrences of keywords found."
        print("Counting completed. Enjoy!\n")
        print(f"Keyword Occurrences: \n{occurrences}\n")
        if show_graph:
            fig, ax = plt.subplots()
            y_list = list(occurrences.keys())
            # find the most frequent keyword and its count
            max_value_key = max(occurrences, key=occurrences.get)
            max_value = occurrences[max_value_key]
            if squeeze_data:
                if 500 <= max_value <= 1000:
                    max_value = 1000
                    deduct_by = 300
                else:
                    deduct_by = 0.03 * max_value
                # squeeze the longest bar(s) so a dominant keyword
                # does not dwarf the rest of the chart
                x_list = [
                    (int(item) / max_value) * 100
                    if int(item) < max_value
                    else ((int(item) - deduct_by) / max_value) * 100
                    for item in occurrences.values()
                ]
            else:
                x_list = [
                    (int(item) / max_value) * 100 for item in occurrences.values()
                ]
            # report keywords that never occurred
            if len(keywords) > len(y_list):
                zero_occ_keywords = [
                    item for item in keywords if item.lower() not in y_list
                ]
                print(f"These keywords have 0 occurrences: {', '.join(zero_occ_keywords)}")
            # capitalize keywords for the y-axis labels
            labels = [keyword.title() for keyword in y_list]
            ax.barh(y_list, x_list, align="center", color=bar_color, zorder=3)
            ax.grid(zorder=0)
            ax.set_yticks(y_list)
            ax.set_yticklabels(labels)
            ax.invert_yaxis()  # labels read top-to-bottom
            ax.set_ylabel("Keyword")
            ax.set_title("Keyword Frequency Table")
            plt.show()
        else:
            return occurrences
    except Exception:
        print(f"Error traceback: \n\n {traceback.format_exc()}")
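# A quick sketch of how the generated pattern behaves (hypothetical input):
#
#   >>> re.findall(r"\W*(\bdata\b|\bdeep learning\b)\W*",
#   ...            "deep learning needs data.", flags=re.IGNORECASE)
#   ['deep learning', 'data']   # findall returns only the capture group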
# set the directory where your PDF files are kept
# directory = "/Users/sarthakjoshi/Data/Assignment/Emerging/202060/journals-pdfs"
directory = "./pdfs"
# Set your keywords here; replace the examples below with your own.
keywords = [
    "ARIMA",
    "CNN",
    "GCN",
    "LSTM",
    "pems",
    "accuracy",
    "computational",
    "deep learning",
    "datasets",
    "geographical similarity",
    "Intelligent transportation system",
    "Loop Detectors",
    "neural network",
    "Spatio-temporal",
    "Sensors",
    "Traffic data",
    "Training data",
    "traffic speed prediction",
    "traffic flow prediction",
]
# Run the counter and show the chart
get_word_frequency(
    keywords=keywords,
    directory=directory,
    show_graph=True,
    read_max_pages=False,
    squeeze_data=True,
    bar_color="#187bcd",
)
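# To work with the raw counts instead of the chart, call with show_graph=False;
# the function then returns a collections.Counter (sketch):
#
#   counts = get_word_frequency(keywords=keywords, directory=directory, show_graph=False)
#   print(counts.most_common(5))   # five most frequent keywords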
"""
1) Make sure you have install Python 3.x
2) Make virtual env wiuth command bellow
> python3 -m venv venv
3) Activate the virtual env with command below
> source venv/bin/activate
4) Copy below libs to requirements.txt
argcomplete==1.10.0
beautifulsoup4==4.8.0
certifi==2020.6.20
chardet==3.0.4
cycler==0.10.0
docx2txt==0.8
EbookLib==0.17.1
extract-msg==0.23.1
IMAPClient==2.1.0
Js2Py==0.70
kiwisolver==1.2.0
lxml==4.5.2
matplotlib==3.3.2
numpy==1.19.2
olefile==0.46
pandas==1.1.2
pdfminer==20191125
pdfminer.six==20181108
pdftotext==2.1.5
Pillow==7.2.0
pycryptodome==3.9.8
pyjsparser==2.7.1
pyparsing==2.4.7
PyPDF2==1.26.0
python-dateutil==2.8.1
python-pptx==0.6.18
pytz==2020.1
six==1.12.0
sortedcontainers==2.2.2
soupsieve==2.0.1
SpeechRecognition==3.8.1
textract==1.6.3
tzlocal==1.5.1
xlrd==1.2.0
XlsxWriter==1.3.6
5) Install all the dependencies with the command below
> pip install -r requirements.txt
6) Run the command below to get the frequency chart
> python count_keywords_frequency.py

Parameters:
1) keywords: list
   The keywords to count, given as a list of strings.
2) directory: str
   The directory where your PDF files are stored.
3) show_graph: bool
   Disable the graph with False, enable it with True.
4) read_max_pages: int
   The maximum number of pages to read per PDF; default is 10.
   Pass False (or 0) to read every page.
5) squeeze_data: bool
   Compress bar lengths near the maximum so one dominant keyword
   does not dwarf the rest of the chart.
6) bar_color: str
   Hex color used for the bars, e.g. "#187bcd".
"""