Created
September 5, 2023 18:11
-
-
Save RyanJulyan/4fd6e3463ebdedd9227b59e4f9a0fed0 to your computer and use it in GitHub Desktop.
This Python script provides a comprehensive utility for file metadata extraction and categorization. It uses various libraries like Tika, NLTK, and YAKE for text parsing, keyword extraction, and file type identification. The script also handles both local files and web URLs.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
from typing import Any, Dict, Iterable, List, Optional, Union | |
import re | |
import mimetypes | |
import pathlib | |
from tika import parser | |
import requests | |
import nltk | |
import inflect | |
import yake | |
nltk.download("wordnet") | |
from nltk.corpus import wordnet # noqa: E402 | |
from nltk.stem import WordNetLemmatizer # noqa: E402 | |
def extract_keywords_from_text(
    text_content: str,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Iterable[Iterable[Union[str, float]]]:
    """
    Extract scored keywords from a text using the YAKE extractor.

    Parameters:
    - text_content: Raw text to analyse.
    - language: Language code passed to the extractor.
    - max_ngram_size: Largest n-gram length considered a keyword.
    - deduplication_threshold: Similarity limit for dropping near-duplicates.
    - numOfKeywords: Maximum number of keywords returned.
    - features: Extra features forwarded to ``yake.KeywordExtractor``.

    Returns:
    - An iterable of (keyword, score) pairs.
    """
    extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=features,
    )
    return extractor.extract_keywords(text_content)
def get_synonyms(word: str) -> Iterable[str]:
    """
    Collect the lower-cased WordNet lemma names for a word.

    Parameters:
    - word: The word to look up.

    Returns:
    - A set of synonym strings gathered from every WordNet synset of the word.
    """
    # A set comprehension over all synsets/lemmas deduplicates automatically.
    return {
        lemma.name().lower()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
def get_all_word_forms(
    words: Iterable[str],
    lemmatizer: Optional[WordNetLemmatizer] = None,
    pluralizer: Optional[inflect.engine] = None,
) -> Iterable[str]:
    """
    Expand each word into itself, its lemma, its singular and its plural form.

    Parameters:
    - words: The words to expand.
    - lemmatizer: An optional WordNetLemmatizer (created lazily if omitted).
    - pluralizer: An optional inflect engine (created lazily if omitted).

    Returns:
    - A deduplicated list of every collected word form.
    """
    lemmatizer = WordNetLemmatizer() if lemmatizer is None else lemmatizer
    pluralizer = inflect.engine() if pluralizer is None else pluralizer
    forms = set()
    for word in words:
        # inflect's singular_noun() returns False when the word is already
        # singular, so `or word` falls back to the word itself.
        singular = pluralizer.singular_noun(word) or word
        forms.update(
            (word, lemmatizer.lemmatize(word), singular, pluralizer.plural(word))
        )
    return list(forms)
def get_data_from_web(url: str) -> Any:
    """
    Fetch data from a given URL and parse the body with the Tika parser.

    Parameters:
    - url: The URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.

    Raises:
    - requests.RequestException: On network failure, timeout, or an HTTP
      error status.
    """
    # A timeout stops the request from hanging forever, and raise_for_status
    # surfaces HTTP errors instead of parsing an error page as document content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return parser.from_buffer(response.content)
def get_data_from_given_path(file_path_or_url: str) -> Any:
    """
    Parse the file at a given path (or URL) with the Tika parser.

    Parameters:
    - file_path_or_url: The file path or URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.
    """
    return parser.from_file(file_path_or_url)
def convert_doc_to_plain_text(file_path_or_url: str) -> str:
    """
    Convert a document (web URL or local path) to plain text.

    Tries a web fetch first; the local-path parse is attempted only if the
    web fetch fails, so a successful web result is never discarded or
    overwritten by a second, redundant parse (the original code always ran
    both attempts).

    Parameters:
    - file_path_or_url: The file path or URL of the document.

    Returns:
    - A string containing the stripped plain-text content of the document.

    Raises:
    - Exception: If content could not be extracted from either source, or
      the parser produced no text content.
    """
    results = None
    try:
        results = get_data_from_web(file_path_or_url)
    except Exception as e:
        print("Error get_data_from_web:")
        print(e)
        print()
    if results is None:
        # Fall back to treating the argument as a local file path.
        try:
            results = get_data_from_given_path(file_path_or_url)
        except Exception as e:
            print("Error get_data_from_given_path:")
            print(e)
            print()
    if results is None:
        raise Exception("could not extract content from URL or Path")
    content = results["content"]
    if content is None:
        # Tika can yield a result dict whose "content" is None (e.g. no
        # extractable text); fail loudly instead of AttributeError on .strip().
        raise Exception("could not extract content from URL or Path")
    return str(content).strip()
def get_text_tags(
    text_content: str, keywords: Optional[Iterable[str]] = None
) -> Iterable[str]:
    """
    Get tags for a given text based on keywords and their synonyms.

    Matching is case-insensitive: the text is lower-cased once and compared
    against lower-cased keywords and synonyms.

    Parameters:
    - text_content: The text content to scan.
    - keywords: An optional iterable of keywords to look for.

    Returns:
    - A deduplicated list of matched keywords plus their synonyms.
    """
    if keywords is None:
        keywords = []
    # Lower-case the haystack once: synonyms from get_synonyms() are already
    # lower-cased, so comparing them against the raw text (as the original
    # did) silently missed capitalised occurrences such as "Rule".
    haystack = text_content.lower()
    tags = []
    for keyword in keywords:
        keyword_synonyms = get_synonyms(keyword)
        if keyword.lower() in haystack or any(
            syn in haystack for syn in keyword_synonyms
        ):
            tags.append(keyword)
            tags.extend(keyword_synonyms)
    return list(set(tags))
def is_link(item: str) -> bool:
    """
    Check if a given item is an HTTP(S) link.

    Parameters:
    - item: The item to check.

    Returns:
    - True if the item starts with an http/https URL, False otherwise.
    """
    # bool(...) makes the function honour its declared return type: re.match
    # returns a Match object (or None), which the original leaked to callers.
    return bool(
        re.match(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            item,
        )
    )
def categorize_file(file_path_or_url: str) -> str:
    """
    Categorize a given file based on its guessed MIME type.

    Parameters:
    - file_path_or_url: The file path or URL to categorize.

    Returns:
    - One of "media", "links", "documents", or "other".
    """
    mime_type, _ = mimetypes.guess_type(file_path_or_url)
    if not mime_type:
        # No recognisable extension: a URL still counts as a link.
        if isinstance(file_path_or_url, str) and is_link(file_path_or_url):
            return "links"
        return "other"
    if "image" in mime_type:
        return "media"
    if "text/html" in mime_type:
        return "links"
    if "application" in mime_type or "text" in mime_type:
        return "documents"
    return "other"
def get_file_metdata(
    file_path_or_url: str,
    keywords: Optional[List[str]] = None,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Dict[str, Union[str, Iterable[str]]]:
    """
    Get metadata (category, extension, tags) for a given file.

    Parameters:
    - file_path_or_url: The file path or URL to get metadata for.
    - keywords: An optional list of keywords to include (never mutated).
    - language: The language for keyword extraction.
    - max_ngram_size: The maximum n-gram size for keyword extraction.
    - deduplication_threshold: The deduplication threshold for keyword extraction.
    - numOfKeywords: The number of keywords to extract.
    - features: Additional features for keyword extraction.

    Returns:
    - A dictionary containing the metadata for the file.
    """
    # Copy the caller's list: the original extend()ed it in place, silently
    # mutating the argument passed in by the caller.
    keywords = [] if keywords is None else list(keywords)
    file_category = categorize_file(file_path_or_url=file_path_or_url)
    text_content = convert_doc_to_plain_text(file_path_or_url=file_path_or_url)
    extracted_keywords = extract_keywords_from_text(
        text_content=text_content,
        language=language,
        max_ngram_size=max_ngram_size,
        deduplication_threshold=deduplication_threshold,
        numOfKeywords=numOfKeywords,
        features=features,
    )
    keywords.extend(text for text, _ in extracted_keywords)
    keywords = list(set(keywords))
    keywords_all_word_forms = get_all_word_forms(words=keywords)
    file_tags = get_text_tags(
        text_content=text_content, keywords=keywords_all_word_forms
    )
    file_extension = pathlib.Path(file_path_or_url).suffix
    return {
        "file_path_or_url": str(file_path_or_url),
        "file_extension": file_extension,
        "file_category": file_category,
        "file_tags": file_tags,
    }
if __name__ == "__main__":
    # Smoke tests for each helper.  NOTE(review): the get_file_metdata call at
    # the bottom parses a real .pptx via Tika — it needs that file present and
    # Tika available to succeed.
    print("categorize_file:")
    # Test the function
    print("image.jpg: ", categorize_file("image.jpg"))  # Output: 'media'
    print("website.html: ", categorize_file("website.html"))  # Output: 'links'
    print("document.pdf: ", categorize_file("document.pdf"))  # Output: 'documents'
    print(
        "presentation.pptx: ", categorize_file("presentation.pptx")
    )  # Output: 'documents'
    print(
        "spreadsheet.xlsx: ", categorize_file("spreadsheet.xlsx")
    )  # Output: 'documents'
    print("unknown.xyz: ", categorize_file("unknown.xyz"))  # Output: 'other'
    print("image.jpg: ", categorize_file("image.jpg"))  # Output: 'media'
    print("website.html: ", categorize_file("website.html"))  # Output: 'links'
    print("document.pdf: ", categorize_file("document.pdf"))  # Output: 'documents'
    print("unknown.xyz: ", categorize_file("unknown.xyz"))  # Output: 'other'
    print(
        "https://www.youtube.com/watch?v=pebwHmibla4&t=378s: ",
        categorize_file("https://www.youtube.com/watch?v=pebwHmibla4&t=378s"),
    )  # Output: 'links'
    print()
    print("get_synonyms:")
    # NOTE(review): this looks up synonyms of the *category* string ("media"),
    # not of the file name — presumably intentional, but worth confirming.
    print(get_synonyms(categorize_file("image.jpg")))
    print()
    print("extract_keywords_from_text:")
    print(extract_keywords_from_text("rule"))
    print()
    print("get_all_word_forms:")
    print(get_all_word_forms(words=["rule"]))
    print()
    print("get_file_metdata:")
    pprint.pprint(get_file_metdata("Rules Engine 002.pptx", keywords=["rule"]))
    print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
inflect
nltk
requests
tika
yake
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment