Last active
September 29, 2020 08:08
-
-
Save iamsarthakjoshi/09c676c343f2b0ab612b0da242fef515 to your computer and use it in GitHub Desktop.
Count Keywords and Display Frequency Bar Chart
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
You need to install poppler for pdftotext (for macOS) | |
> brew install pkg-config poppler | |
@JOSHI, Sarthak | |
PyPi: https://pypi.org/project/pdftotext/ | |
""" | |
import os | |
import re | |
import traceback | |
import pdftotext | |
import collections | |
import matplotlib.pyplot as plt | |
from copy import copy | |
def _collect_pdf_text(directory, read_max_pages):
    """Extract text from every ``.pdf`` file in *directory*.

    Reads at most *read_max_pages* pages per file (a falsy value means
    "read every page") and returns ``(concatenated_text, pdf_file_count)``.
    """
    pages = []
    count_files = 0
    for filename in os.listdir(directory):
        if not filename.endswith(".pdf"):
            continue
        filepath = os.path.join(directory, filename)
        with open(filepath, "rb") as f:
            pdf = pdftotext.PDF(f, raw=True)
            total_pages = len(pdf)
            # BUG FIX: the original compared against a hard-coded 10 and
            # could set total_pages past the real page count; cap at the
            # document length instead.
            if read_max_pages:
                total_pages = min(total_pages, read_max_pages)
                print(f"Reading max pages updated to: {total_pages}\n")
            count_files += 1
            print(f"{count_files} - {filename} - Completed")
            pages.extend(pdf[i] for i in range(total_pages))
    # Join once at the end instead of quadratic `+=` string concatenation.
    return "".join(pages), count_files


def _plot_frequencies(occurrences, keywords, squeeze_data):
    """Show a horizontal bar chart of relative keyword frequencies.

    *occurrences* is a Counter of lower-cased matched keywords; bars are
    scaled to percent of the largest count. When *squeeze_data* is true,
    counts at or above the maximum are pulled down slightly so smaller
    bars stay visible.
    """
    fig, ax = plt.subplots()
    y_list = list(occurrences.keys())
    max_value = max(occurrences.values())
    if squeeze_data:
        if 500 <= max_value <= 1000:
            max_value = 1000
            deduct_by = 300
        else:
            deduct_by = (0.3 / 10) * max_value
        # Squeeze the data so the tallest bar does not dwarf the rest.
        x_list = [
            (int(item) / max_value) * 100
            if int(item) < max_value
            else ((int(item) - deduct_by) / max_value) * 100
            for item in occurrences.values()
        ]
    else:
        x_list = [(int(item) / max_value) * 100 for item in occurrences.values()]
    # Report any requested keyword that never occurred in the corpus.
    if len(keywords) > len(y_list):
        zero_occ_keywords = [
            item for item in keywords if item.lower() not in y_list
        ]
        print(f"These keywords have 0 occurrences: {', '.join(zero_occ_keywords)}")
    labels = [key.title() for key in y_list]
    ax.barh(y_list, x_list, align="center", color="#70AD47", zorder=3)
    ax.grid(zorder=0)
    ax.set_yticks(y_list)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_ylabel("Keyword")
    ax.set_title("Keyword Frequency Table")
    plt.show()


def get_word_frequency(
    keywords, directory, show_graph=False, read_max_pages=10, squeeze_data=True
):
    """Count occurrences of *keywords* across all PDFs in *directory*.

    Parameters
    ----------
    keywords : list[str]
        Terms to search for; matched case-insensitively on word boundaries.
    directory : str
        Path of a directory containing ``.pdf`` files.
    show_graph : bool
        When True, display a bar chart instead of returning the counts.
    read_max_pages : int | bool
        Per-file page cap; a falsy value reads every page.
    squeeze_data : bool
        Compress outsized counts in the chart (see ``_plot_frequencies``).

    Returns
    -------
    collections.Counter | str | None
        The counts when ``show_graph`` is False; an explanatory message
        string on bad arguments or when nothing matched; ``None`` after
        the chart has been shown or when an unexpected error is logged.
    """
    if not keywords:
        return "No Keywords Given!"
    if not directory:
        return "No Directory Path Given!"
    print(f"Reading max pages of: {read_max_pages}")
    print(f"Show Graph: {show_graph}")
    try:
        # BUG FIX: escape each keyword so regex metacharacters ('-', '.',
        # '(' ...) are matched literally instead of being interpreted.
        _keywords = [rf"\b{re.escape(keyword)}\b" for keyword in keywords]
        print("KEYS", _keywords)
        re_pattern = rf"""\W*({"|".join(_keywords)})\W*"""
        print(f"Regex pattern: {re_pattern}")
        overall_text, count_files = _collect_pdf_text(directory, read_max_pages)
        # BUG FIX: the original used for/else, which printed this message
        # on every run (for-else fires whenever the loop ends without
        # `break`); now it only fires when no PDF was actually found.
        if count_files == 0:
            print("No PDF files found")
        matched_text = re.findall(
            pattern=re_pattern,
            string=overall_text.lower(),
            flags=re.MULTILINE | re.IGNORECASE,
        )
        occurrences = collections.Counter(matched_text)
        if not occurrences:
            return "No occurrences of keywords found."
        print(f"Occurrences: {occurrences}")
        if show_graph:
            _plot_frequencies(occurrences, keywords, squeeze_data)
            return None
        return occurrences
    except Exception:
        # Best-effort: log the full traceback rather than crash the caller.
        print(f"This is an error Message at: \n\n {traceback.format_exc()}")
# Directory that holds the PDF files to scan.
directory = "/Users/sarthakjoshi/Data/Assignment/Emerging/202060/journals-pdfs"
# directory = "./pdfs"

# Keywords to count — swap these out for your own search terms.
keywords = [
    "ARIMA",
    "CNN",
    "GCN",
    "LSTM",
    "pems",
    "accuracy",
    "computational",
    "deep learning",
    "datasets",
    "geographical similarity",
    "Intelligent transportation system",
    "Loop Detectors",
    "neural network",
    "Spatio-temporal",
    "Sensors",
    "Traffic data",
    "Training data",
    "traffic speed prediction",
    "traffic flow prediction",
]

# Run the count and pop up the frequency chart.
get_word_frequency(
    keywords=keywords,
    directory=directory,
    show_graph=True,
    read_max_pages=False,
    squeeze_data=True,
)
""" | |
1) Make sure you have installed Python 3.x
2) Make a virtual env with the command below
> python3 -m venv venv | |
3) Activate the virtual env with command below | |
> source venv/bin/activate | |
4) Copy below libs to requirements.txt | |
argcomplete==1.10.0 | |
beautifulsoup4==4.8.0 | |
certifi==2020.6.20 | |
chardet==3.0.4 | |
cycler==0.10.0 | |
docx2txt==0.8 | |
EbookLib==0.17.1 | |
extract-msg==0.23.1 | |
IMAPClient==2.1.0 | |
Js2Py==0.70 | |
kiwisolver==1.2.0 | |
lxml==4.5.2 | |
matplotlib==3.3.2 | |
numpy==1.19.2 | |
olefile==0.46 | |
pandas==1.1.2 | |
pdfminer==20191125 | |
pdfminer.six==20181108 | |
pdftotext==2.1.5 | |
Pillow==7.2.0 | |
pycryptodome==3.9.8 | |
pyjsparser==2.7.1 | |
pyparsing==2.4.7 | |
PyPDF2==1.26.0 | |
python-dateutil==2.8.1 | |
python-pptx==0.6.18 | |
pytz==2020.1 | |
six==1.12.0 | |
sortedcontainers==2.2.2 | |
soupsieve==2.0.1 | |
SpeechRecognition==3.8.1 | |
textract==1.6.3 | |
tzlocal==1.5.1 | |
xlrd==1.2.0 | |
XlsxWriter==1.3.6 | |
5) Install all the dependencies with command below | |
> pip install -r requirements.txt | |
6) Run below command to get the frequency chart | |
> python count_keywords_frequency.py | |
Parameters: | |
1) keywords: list | |
You can put your keywords in 'keywords' list | |
2) directory: str | |
    You can put your directory where your pdf files are stored
3) show_graph: bool | |
    You can disable showing the graph with False and enable it with True
4) read_max_pages: int | |
You can set maximum pages to be read, default is 10 | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment