Last active
June 7, 2022 10:20
-
-
Save Lakshmi-1212/1463a49db5e41e54c498628f6145afa9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect keywords from every PDF directly inside `docs_path` and render them
# as one word cloud. `read_file`, `extract_keywords` and `create_word_cloud`
# are defined elsewhere in this project.
docs_path = 'sample_files'
ignore_words = ['Fig', 'like', 'e.g.', 'i.e.', 'one']
all_keywords = []

for filename in os.listdir(docs_path):
    filepath = os.path.join(docs_path, filename)
    # Skip sub-directories and anything whose name does not end in '.pdf'.
    if not (os.path.isfile(filepath) and filename.endswith('.pdf')):
        continue

    print(f'Parsing file: {filename}')
    try:
        file_text = read_file(filepath)
        keywords = extract_keywords(file_text, min_word_length=3, ignore_words=ignore_words)
        all_keywords.extend(keywords)
    except Exception as err:
        # Best effort: one unreadable file must not abort the whole run.
        # `Exception` (not a bare `except`) lets Ctrl-C / SystemExit through.
        print(f'ERROR!!! Unable to parse file: {filename}. Ignoring file!!')
        print(f'Reason: {err!r}')

print(f'Completed reading all pdf files in folder:{docs_path}')
create_word_cloud(all_keywords, bg='black', cmap='Set2', random_state=100, width=1000, height=1000)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment