Count Keywords and Display Frequency Bar Chart
"""
You need to install poppler for pdftotext (for macOS)
> brew install pkg-config poppler
@JOSHI, Sarthak
PyPi: https://pypi.org/project/pdftotext/
"""
import collections
import os
import re
import traceback

import matplotlib.pyplot as plt
import pdftotext

def get_word_frequency(
    keywords, directory, show_graph=False, read_max_pages=10, squeeze_data=True
):
    if not keywords:
        return "No Keywords Given!"
    if not directory:
        return "No Directory Path Given!"

    print(f"Reading max pages of: {read_max_pages}")
    print(f"Show Graph: {show_graph}")

    try:
        _keywords = [rf"\b{keyword}\b" for keyword in keywords]
        print("KEYS", _keywords)
        re_pattern = rf"""\W*({"|".join(_keywords)})\W*"""
        overall_text = ""
        count_files = 0
        print(f"Regex pattern: {re_pattern}")
        for filename in os.listdir(directory):
            if filename.endswith(".pdf"):
                filepath = os.path.join(directory, filename)
                with open(filepath, "rb") as f:
                    pdfFileObj = pdftotext.PDF(f, raw=True)
                    total_pages = len(pdfFileObj)
                    # cap the number of pages read per file, if a cap was given
                    if read_max_pages and total_pages > read_max_pages:
                        total_pages = read_max_pages
                        print(f"Reading max pages updated to: {total_pages}\n")
                    count_files += 1
                    count_pages = 0
                    print(f"{count_files} - {filename} - Completed")
                    while count_pages < total_pages:
                        pageObj = pdfFileObj[count_pages]
                        count_pages += 1
                        overall_text += pageObj
        if count_files == 0:
            print("No PDF files found")

        matched_text = re.findall(
            pattern=re_pattern,
            string=overall_text.lower(),
            flags=re.MULTILINE | re.IGNORECASE,
        )
        occurrences = collections.Counter(matched_text)
        if not occurrences:
            return "No occurrences of keywords found."
        print(f"Occurrences: {occurrences}")

        if occurrences and show_graph:
            fig, ax = plt.subplots()
            y_list = list(occurrences.keys())
            x_list = list(occurrences.values())

            # find the most frequent keyword and its count
            occurrences_dict = dict(occurrences)
            max_value_key = max(
                occurrences_dict.keys(), key=(lambda key: occurrences_dict[key])
            )
            max_value = occurrences_dict[max_value_key]

            if squeeze_data:
                if 500 <= max_value <= 1000:
                    max_value = 1000
                    deduct_by = 300
                else:
                    deduct_by = (0.3 / 10) * max_value
                # normalize counts to percentages of max_value; counts at the
                # maximum are reduced by deduct_by first so one dominant
                # keyword does not dwarf the rest of the chart
                x_list = [
                    (int(item) / max_value) * 100
                    if int(item) < max_value
                    else ((int(item) - deduct_by) / max_value) * 100
                    for item in occurrences.values()
                ]
            else:
                x_list = [
                    (int(item) / max_value) * 100 for item in occurrences.values()
                ]
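            # Worked example (illustrative numbers): with max_value = 200,
            # deduct_by = (0.3 / 10) * 200 = 6, so the top keyword plots at
            # ((200 - 6) / 200) * 100 = 97 instead of 100, while a keyword
            # with 50 hits plots at (50 / 200) * 100 = 25.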

            # report any keywords that never matched
            if len(keywords) > len(y_list):
                zero_occ_keywords = [
                    item for item in keywords if item.lower() not in y_list
                ]
                print(f"These keywords have 0 occurrences: {', '.join(zero_occ_keywords)}")

            # capitalize keyword labels for display
            keywords = [keyword.title() for keyword in y_list]

            ax.barh(y_list, x_list, align="center", color="#70AD47", zorder=3)
            ax.grid(zorder=0)
            ax.set_yticks(y_list)
            ax.set_yticklabels(keywords)
            ax.invert_yaxis()  # labels read top-to-bottom
            ax.set_ylabel("Keyword")
            ax.set_title("Keyword Frequency Table")
            plt.show()
        else:
            return occurrences
    except Exception:
        print(f"Error encountered:\n\n{traceback.format_exc()}")


# set the directory where your PDF files are kept
directory = "/Users/sarthakjoshi/Data/Assignment/Emerging/202060/journals-pdfs"
# directory = "./pdfs"

# Set your keywords here; replace the examples below with your own.
keywords = [
"ARIMA",
"CNN",
"GCN",
"LSTM",
"pems",
"accuracy",
"computational",
"deep learning",
"datasets",
"geographical similarity",
"Intelligent transportation system",
"Loop Detectors",
"neural network",
"Spatio-temporal",
"Sensors",
"Traffic data",
"Training data",
"traffic speed prediction",
"traffic flow prediction",
]

# Run the keyword count. read_max_pages=False disables the page cap and
# reads every page of each PDF.
get_word_frequency(
    keywords=keywords,
    directory=directory,
    show_graph=True,
    read_max_pages=False,
    squeeze_data=True,
)
"""
1) Make sure you have install Python 3.x
2) Make virtual env wiuth command bellow
> python3 -m venv venv
3) Activate the virtual env with command below
> source venv/bin/activate
4) Copy below libs to requirements.txt
argcomplete==1.10.0
beautifulsoup4==4.8.0
certifi==2020.6.20
chardet==3.0.4
cycler==0.10.0
docx2txt==0.8
EbookLib==0.17.1
extract-msg==0.23.1
IMAPClient==2.1.0
Js2Py==0.70
kiwisolver==1.2.0
lxml==4.5.2
matplotlib==3.3.2
numpy==1.19.2
olefile==0.46
pandas==1.1.2
pdfminer==20191125
pdfminer.six==20181108
pdftotext==2.1.5
Pillow==7.2.0
pycryptodome==3.9.8
pyjsparser==2.7.1
pyparsing==2.4.7
PyPDF2==1.26.0
python-dateutil==2.8.1
python-pptx==0.6.18
pytz==2020.1
six==1.12.0
sortedcontainers==2.2.2
soupsieve==2.0.1
SpeechRecognition==3.8.1
textract==1.6.3
tzlocal==1.5.1
xlrd==1.2.0
XlsxWriter==1.3.6
5) Install all the dependencies with the command below
> pip install -r requirements.txt
6) Run the command below to get the frequency chart
> python count_keywords_frequency.py

Parameters:
1) keywords: list
You can put your keywords in the 'keywords' list
2) directory: str
You can set the path of the directory where your PDF files are stored
3) show_graph: bool
You can disable the graph with False and enable it with True; when
disabled, the function returns the collections.Counter of matches
4) read_max_pages: int
You can set the maximum pages to be read per PDF; default is 10, and
passing False reads every page
5) squeeze_data: bool
You can leave this True to pull counts at the maximum in slightly so one
dominant keyword does not flatten the rest of the chart
"""