Created
June 21, 2020 11:40
Identify the language of documents.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ----------------------------------------------------------------------------- | |
# Import libraries | |
import nltk | |
from nltk.tokenize import wordpunct_tokenize | |
from nltk.corpus import stopwords | |
nltk.download('stopwords') | |
import PyPDF2 | |
import os | |
path = "Input/" | |
import email | |
import docx2txt | |
import pytesseract | |
import pdf2image | |
from PIL import Image | |
import fitz # pip install PyMuPDF | |
from time import time | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Language detector | |
def lang_detector(test_data): | |
lang_ratio ={} | |
tokenize_string = wordpunct_tokenize(test_data) | |
for language in stopwords.fileids(): | |
set1 = set(stopwords.words(language)) | |
set2 = set(tokenize_string) | |
total_set = set1.intersection(set2) | |
lang_ratio[language] = len(total_set) | |
print(lang_ratio[language]) | |
final_lang = max(lang_ratio, key=lang_ratio.get) | |
max_lang_count = max(lang_ratio.values()) | |
if max_lang_count == 0: | |
print("Method1 :") | |
from langdetect import detect | |
lang = detect(test_data) | |
print(lang) | |
else: | |
print("Method2 :") | |
print("The language of the text file is :" + str(final_lang)) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Extract digital pdf information | |
def pdf_information_extractor(files): | |
final_string = "" | |
file_obj = open(path + str(files), 'rb') | |
file_read_obj = PyPDF2.PdfFileReader(file_obj) | |
page_obj = file_read_obj.getPage(0) | |
final_string = page_obj.extractText() | |
print(final_string) | |
lang_detector(final_string) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Extract image information for png, jpg, jpeg, tiff | |
def img_information_extractor(files): | |
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' | |
text = pytesseract.image_to_string(path + str(files)) | |
print(text) | |
lang_detector(text) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Extract image of non digital and free memory | |
def img_information_extractor_for_cache(): | |
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' | |
file_= "Cache/outfile.png" | |
text = pytesseract.image_to_string(file_) | |
print(text) | |
lang_detector(text) | |
os.remove(file_) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Non digital to Img and then Image information | |
def non_digital_pdf_information_extractor(files): | |
doc = fitz.open(files) | |
page = doc.loadPage(0) #number of page | |
pix = page.getPixmap() | |
output = "outfile.png" | |
pix.writePNG("Cache/"+output) | |
img_information_extractor_for_cache() | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Digital and Non digital classifier | |
def digital_nondigital_checker(files): | |
pdf_reader = PyPDF2.PdfFileReader(path + str(files), 'rb') | |
total_pages = pdf_reader.getNumPages() | |
print("Number of pages in pdf : {}".format(total_pages)) | |
curr_page = 0 | |
page_type_flag = False | |
while total_pages: | |
page_data = pdf_reader.getPage(curr_page) | |
if '/Font' in page_data['/Resources']: | |
print("The pdf is digital.") | |
pdf_information_extractor(files) | |
total_pages -= 1 | |
curr_page += 1 | |
if(page_type_flag): | |
print("The pdf is mixed.") | |
break | |
else: | |
print("The pdf is non digital.") | |
non_digital_pdf_information_extractor(files) | |
total_pages -= 1 | |
curr_page += 1 | |
page_type_flag = True | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Mail information | |
def msg_information_extractor(files): | |
mail_file = open(path+str(files), 'r') | |
subject_line = "" | |
for item in mail_file: | |
if item.startswith("Subject"): | |
subject_line = item[8:] | |
lang_detector(subject_line) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# htm and html information | |
def htm_infomation_extractor(files): | |
mail_file = open(path+str(files), 'r+', encoding="utf8") | |
htm_string = "" | |
for item in mail_file: | |
htm_string += item | |
lang_detector(htm_string) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# docx information | |
def docx_information_extractor(files): | |
my_text = docx2txt.process(path + str(files)) | |
print(my_text[0:100]) | |
lang_detector(my_text[0:100]) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# text information | |
def text_information_extraction(files): | |
final_string = "" | |
test_data = open(path + str(files), 'r+') | |
for item in test_data: | |
final_string += item | |
final_string = final_string.lower() | |
print(final_string) | |
lang_detector(final_string) | |
# ----------------------------------------------------------------------------- | |
# ----------------------------------------------------------------------------- | |
# Take input | |
def input_from_folder(): | |
start_time = time() | |
for files in os.listdir(path): | |
if files.endswith(".png"): | |
#img_information_extractor(files) | |
pass | |
elif files.endswith(".jpg"): | |
#img_information_extractor(files) | |
pass | |
elif files.endswith(".jpeg"): | |
#img_information_extractor(files) | |
pass | |
elif files.endswith(".tif"): | |
#img_information_extractor(files) | |
pass | |
elif files.endswith(".htm"): | |
#htm_infomation_extractor(files) | |
pass | |
elif files.endswith(".html"): | |
#htm_infomation_extractor(files) | |
pass | |
elif files.endswith(".doc"): | |
#docx_information_extractor(files) | |
pass | |
elif files.endswith(".docx"): | |
#docx_information_extractor(files) | |
pass | |
elif files.endswith(".pdf"): | |
digital_nondigital_checker(files) | |
elif files.endswith(".txt"): | |
#text_information_extraction(files) | |
pass | |
elif files.endswith(".msg"): | |
#msg_information_extractor(files) | |
pass | |
else: | |
print("Not matching with the file criteria.....") | |
end_time = time() | |
print("Time required for execution : {}".format(end_time - start_time)) | |
# ----------------------------------------------------------------------------- | |
# Call me | |
if(__name__ == '__main__'): | |
input_from_folder() | |
# ----------------------------------------------------------------------------- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment