@RyanJulyan
Created September 5, 2023 18:11
This Python script provides a comprehensive utility for file metadata extraction and categorization. It uses various libraries like Tika, NLTK, and YAKE for text parsing, keyword extraction, and file type identification. The script also handles both local files and web URLs.
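
A minimal usage sketch, assuming the script below is saved as file_metadata.py and its dependencies (listed after the script) are installed; the module name, sample file name, and keyword here are hypothetical:

    from file_metadata import get_file_metadata

    metadata = get_file_metadata("quarterly_report.pdf", keywords=["rule"])
    print(metadata["file_category"])  # e.g. 'documents'
    print(metadata["file_tags"])  # matched keywords, synonyms, and word forms
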
import pprint
from typing import Any, Dict, Iterable, List, Optional, Union
import re
import mimetypes
import pathlib

from tika import parser
import requests
import nltk
import inflect
import yake

# Ensure the WordNet corpus is available for the wordnet / WordNetLemmatizer usage below.
nltk.download("wordnet")

from nltk.corpus import wordnet  # noqa: E402
from nltk.stem import WordNetLemmatizer  # noqa: E402


def extract_keywords_from_text(
    text_content: str,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Iterable[Iterable[Union[str, float]]]:
    """
    Extract keywords from a given text.

    Parameters:
    - text_content: The text to extract keywords from.
    - language: The language of the text.
    - max_ngram_size: The maximum size of the n-grams.
    - deduplication_threshold: The deduplication threshold.
    - numOfKeywords: The number of keywords to extract.
    - features: Any additional features for the keyword extractor.

    Returns:
    - An iterable of tuples containing the keyword and its score.
    """
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=features,
    )
    return custom_kw_extractor.extract_keywords(text_content)


def get_synonyms(word: str) -> Iterable[str]:
    """
    Get synonyms of a given word.

    Parameters:
    - word: The word to find synonyms for.

    Returns:
    - An iterable of synonyms.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms
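
# For example, get_synonyms("media") typically includes forms such as "medium" and
# "media"; the exact set depends on the installed WordNet data.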


def get_all_word_forms(
    words: Iterable[str],
    lemmatizer: Optional[WordNetLemmatizer] = None,
    pluralizer: Optional[inflect.engine] = None,
) -> Iterable[str]:
    """
    Get all forms of a list of words.

    Parameters:
    - words: The words to find all forms of.
    - lemmatizer: An optional WordNetLemmatizer.
    - pluralizer: An optional inflect engine.

    Returns:
    - An iterable of all word forms.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    if pluralizer is None:
        pluralizer = inflect.engine()

    all_word_forms = []
    for word in words:
        root = lemmatizer.lemmatize(word)
        # singular_noun() returns False when the word is already singular.
        singular = pluralizer.singular_noun(word) or word
        plural = pluralizer.plural(word)
        all_word_forms.extend([word, root, singular, plural])
    return list(set(all_word_forms))
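
# For example, get_all_word_forms(words=["rule"]) returns the de-duplicated list of the
# word, its lemma, its singular form, and its plural, i.e. ["rule", "rules"] (order may vary).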


def get_data_from_web(url: str) -> Any:
    """
    Fetch data from a given URL using the Tika parser.

    Parameters:
    - url: The URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.
    """
    response = requests.get(url)
    results = parser.from_buffer(response.content)
    return results


def get_data_from_given_path(file_path_or_url: str) -> Any:
    """
    Fetch data from a given file path or URL using the Tika parser.

    Parameters:
    - file_path_or_url: The file path or URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.
    """
    results = parser.from_file(file_path_or_url)
    return results


def convert_doc_to_plain_text(file_path_or_url: str) -> str:
    """
    Convert a document to plain text.

    Parameters:
    - file_path_or_url: The file path or URL of the document.

    Returns:
    - A string containing the plain text content of the document.
    """
    results = None
    try:
        results = get_data_from_web(file_path_or_url)
    except Exception as e:
        print("Error get_data_from_web:")
        print(e)
        print()

    # Fall back to treating the input as a local path if the web fetch failed
    # or produced no content.
    if results is None or results.get("content") is None:
        try:
            results = get_data_from_given_path(file_path_or_url)
        except Exception as e:
            print("Error get_data_from_given_path:")
            print(e)
            print()

    if results is None or results.get("content") is None:
        raise Exception("Could not extract content from the given URL or path.")

    text_content = str(results["content"]).strip()
    return text_content


def get_text_tags(
    text_content: str, keywords: Optional[Iterable[str]] = None
) -> Iterable[str]:
    """
    Get tags for a given text based on keywords and their synonyms.

    Parameters:
    - text_content: The text content.
    - keywords: An optional list of keywords to look for.

    Returns:
    - A list of tags.
    """
    if keywords is None:
        keywords = []

    # Lowercase the text once so keyword and synonym matching is case-insensitive.
    text_content_lower = text_content.lower()
    tags = []
    for keyword in keywords:
        keyword_synonyms = get_synonyms(keyword)
        if keyword.lower() in text_content_lower or any(
            syn in text_content_lower for syn in keyword_synonyms
        ):
            tags.append(keyword)
            tags.extend(keyword_synonyms)
    return list(set(tags))


def is_link(item: str) -> bool:
    """
    Check if a given item is a link.

    Parameters:
    - item: The item to check.

    Returns:
    - True if the item is a link, False otherwise.
    """
    return bool(
        re.match(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            item,
        )
    )


def categorize_file(file_path_or_url: str) -> str:
    """
    Categorize a given file based on its type.

    Parameters:
    - file_path_or_url: The file path or URL to categorize.

    Returns:
    - A string representing the category of the file.
    """
    mime_type, _ = mimetypes.guess_type(file_path_or_url)
    if mime_type:
        if "image" in mime_type:
            return "media"
        elif "text/html" in mime_type:
            return "links"
        elif "application" in mime_type or "text" in mime_type:
            return "documents"
        else:
            return "other"
    elif isinstance(file_path_or_url, str) and is_link(file_path_or_url):
        return "links"
    return "other"


def get_file_metadata(
    file_path_or_url: str,
    keywords: Optional[List[str]] = None,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Dict[str, Union[str, Iterable[str]]]:
    """
    Get metadata for a given file.

    Parameters:
    - file_path_or_url: The file path or URL to get metadata for.
    - keywords: An optional list of keywords to include.
    - language: The language for keyword extraction.
    - max_ngram_size: The maximum n-gram size for keyword extraction.
    - deduplication_threshold: The deduplication threshold for keyword extraction.
    - numOfKeywords: The number of keywords to extract.
    - features: Additional features for keyword extraction.

    Returns:
    - A dictionary containing the metadata for the file.
    """
    if keywords is None:
        keywords = []

    file_category = categorize_file(file_path_or_url=file_path_or_url)
    text_content = convert_doc_to_plain_text(file_path_or_url=file_path_or_url)

    extracted_keywords = extract_keywords_from_text(
        text_content=text_content,
        language=language,
        max_ngram_size=max_ngram_size,
        deduplication_threshold=deduplication_threshold,
        numOfKeywords=numOfKeywords,
        features=features,
    )
    extracted_keywords_only = [text for text, _ in extracted_keywords]

    keywords.extend(extracted_keywords_only)
    keywords = list(set(keywords))
    keywords_all_word_forms = get_all_word_forms(words=keywords)

    file_tags = get_text_tags(
        text_content=text_content, keywords=keywords_all_word_forms
    )
    file_extension = pathlib.Path(file_path_or_url).suffix

    return {
        "file_path_or_url": str(file_path_or_url),
        "file_extension": file_extension,
        "file_category": file_category,
        "file_tags": file_tags,
    }


if __name__ == "__main__":
    print("categorize_file:")
    # Test the function
    print("image.jpg: ", categorize_file("image.jpg"))  # Output: 'media'
    print("website.html: ", categorize_file("website.html"))  # Output: 'links'
    print("document.pdf: ", categorize_file("document.pdf"))  # Output: 'documents'
    print(
        "presentation.pptx: ", categorize_file("presentation.pptx")
    )  # Output: 'documents'
    print(
        "spreadsheet.xlsx: ", categorize_file("spreadsheet.xlsx")
    )  # Output: 'documents'
    print("unknown.xyz: ", categorize_file("unknown.xyz"))  # Output: 'other'
    print(
        "https://www.youtube.com/watch?v=pebwHmibla4&t=378s: ",
        categorize_file("https://www.youtube.com/watch?v=pebwHmibla4&t=378s"),
    )  # Output: 'links'
    print()

    print("get_synonyms:")
    print(get_synonyms(categorize_file("image.jpg")))
    print()

    print("extract_keywords_from_text:")
    print(extract_keywords_from_text("rule"))
    print()

    print("get_all_word_forms:")
    print(get_all_word_forms(words=["rule"]))
    print()

    print("get_file_metadata:")
    pprint.pprint(get_file_metadata("Rules Engine 002.pptx", keywords=["rule"]))
    print()
inflect
nltk
requests
tika
yake
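
The bare package names above (inflect, nltk, requests, tika, yake) are the script's third-party dependencies, i.e. the contents of its requirements.txt. They install with pip in the usual way; note that the tika package also needs a Java runtime available, since it launches a local Apache Tika server, and the script downloads the WordNet corpus via nltk.download("wordnet") on first run.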