@RyanJulyan
Created September 5, 2023 18:11
This Python script provides a comprehensive utility for file metadata extraction and categorization. It uses various libraries like Tika, NLTK, and YAKE for text parsing, keyword extraction, and file type identification. The script also handles both local files and web URLs.
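
A minimal usage sketch, assuming the script below is saved as file_metadata.py and its dependencies (listed after the script) are installed; the module name, sample file name, and keyword here are hypothetical:

    from file_metadata import get_file_metadata

    metadata = get_file_metadata("quarterly_report.pdf", keywords=["rule"])
    print(metadata["file_category"])  # e.g. 'documents'
    print(metadata["file_tags"])  # matched keywords, synonyms, and word forms
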
import pprint
from typing import Any, Dict, Iterable, List, Optional, Union
import re
import mimetypes
import pathlib

from tika import parser
import requests
import nltk
import inflect
import yake

# Ensure the WordNet corpus is available for the wordnet / WordNetLemmatizer usage below.
nltk.download("wordnet")

from nltk.corpus import wordnet  # noqa: E402
from nltk.stem import WordNetLemmatizer  # noqa: E402


def extract_keywords_from_text(
    text_content: str,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Iterable[Iterable[Union[str, float]]]:
    """
    Extract keywords from a given text.

    Parameters:
    - text_content: The text to extract keywords from.
    - language: The language of the text.
    - max_ngram_size: The maximum size of the n-grams.
    - deduplication_threshold: The deduplication threshold.
    - numOfKeywords: The number of keywords to extract.
    - features: Any additional features for the keyword extractor.

    Returns:
    - An iterable of tuples containing the keyword and its score.
    """
    custom_kw_extractor = yake.KeywordExtractor(
        lan=language,
        n=max_ngram_size,
        dedupLim=deduplication_threshold,
        top=numOfKeywords,
        features=features,
    )
    return custom_kw_extractor.extract_keywords(text_content)


def get_synonyms(word: str) -> Iterable[str]:
    """
    Get synonyms of a given word.

    Parameters:
    - word: The word to find synonyms for.

    Returns:
    - An iterable of synonyms.
    """
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return synonyms
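
# For example, get_synonyms("media") typically includes forms such as "medium" and
# "media"; the exact set depends on the installed WordNet data.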


def get_all_word_forms(
    words: Iterable[str],
    lemmatizer: Optional[WordNetLemmatizer] = None,
    pluralizer: Optional[inflect.engine] = None,
) -> Iterable[str]:
    """
    Get all forms of a list of words.

    Parameters:
    - words: The words to find all forms of.
    - lemmatizer: An optional WordNetLemmatizer.
    - pluralizer: An optional inflect engine.

    Returns:
    - An iterable of all word forms.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    if pluralizer is None:
        pluralizer = inflect.engine()

    all_word_forms = []
    for word in words:
        root = lemmatizer.lemmatize(word)
        # singular_noun() returns False when the word is already singular.
        singular = pluralizer.singular_noun(word) or word
        plural = pluralizer.plural(word)
        all_word_forms.extend([word, root, singular, plural])
    return list(set(all_word_forms))
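
# For example, get_all_word_forms(words=["rule"]) returns the de-duplicated list of the
# word, its lemma, its singular form, and its plural, i.e. ["rule", "rules"] (order may vary).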


def get_data_from_web(url: str) -> Any:
    """
    Fetch data from a given URL using the Tika parser.

    Parameters:
    - url: The URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.
    """
    response = requests.get(url)
    results = parser.from_buffer(response.content)
    return results


def get_data_from_given_path(file_path_or_url: str) -> Any:
    """
    Fetch data from a given file path or URL using the Tika parser.

    Parameters:
    - file_path_or_url: The file path or URL to fetch data from.

    Returns:
    - A dictionary containing the parsed data.
    """
    results = parser.from_file(file_path_or_url)
    return results


def convert_doc_to_plain_text(file_path_or_url: str) -> str:
    """
    Convert a document to plain text.

    Parameters:
    - file_path_or_url: The file path or URL of the document.

    Returns:
    - A string containing the plain text content of the document.
    """
    results = None
    try:
        results = get_data_from_web(file_path_or_url)
    except Exception as e:
        print("Error get_data_from_web:")
        print(e)
        print()

    # Fall back to treating the input as a local path if the web fetch failed
    # or produced no content.
    if results is None or results.get("content") is None:
        try:
            results = get_data_from_given_path(file_path_or_url)
        except Exception as e:
            print("Error get_data_from_given_path:")
            print(e)
            print()

    if results is None or results.get("content") is None:
        raise Exception("Could not extract content from the given URL or path.")

    text_content = str(results["content"]).strip()
    return text_content


def get_text_tags(
    text_content: str, keywords: Optional[Iterable[str]] = None
) -> Iterable[str]:
    """
    Get tags for a given text based on keywords and their synonyms.

    Parameters:
    - text_content: The text content.
    - keywords: An optional list of keywords to look for.

    Returns:
    - A list of tags.
    """
    if keywords is None:
        keywords = []

    # Lowercase the text once so keyword and synonym matching is case-insensitive.
    text_content_lower = text_content.lower()
    tags = []
    for keyword in keywords:
        keyword_synonyms = get_synonyms(keyword)
        if keyword.lower() in text_content_lower or any(
            syn in text_content_lower for syn in keyword_synonyms
        ):
            tags.append(keyword)
            tags.extend(keyword_synonyms)
    return list(set(tags))


def is_link(item: str) -> bool:
    """
    Check if a given item is a link.

    Parameters:
    - item: The item to check.

    Returns:
    - True if the item is a link, False otherwise.
    """
    return bool(
        re.match(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            item,
        )
    )


def categorize_file(file_path_or_url: str) -> str:
    """
    Categorize a given file based on its type.

    Parameters:
    - file_path_or_url: The file path or URL to categorize.

    Returns:
    - A string representing the category of the file.
    """
    mime_type, _ = mimetypes.guess_type(file_path_or_url)
    if mime_type:
        if "image" in mime_type:
            return "media"
        elif "text/html" in mime_type:
            return "links"
        elif "application" in mime_type or "text" in mime_type:
            return "documents"
        else:
            return "other"
    elif isinstance(file_path_or_url, str) and is_link(file_path_or_url):
        return "links"
    return "other"


def get_file_metadata(
    file_path_or_url: str,
    keywords: Optional[List[str]] = None,
    language: str = "en",
    max_ngram_size: int = 1,
    deduplication_threshold: float = 0.9,
    numOfKeywords: int = 5,
    features: Any = None,
) -> Dict[str, Union[str, Iterable[str]]]:
    """
    Get metadata for a given file.

    Parameters:
    - file_path_or_url: The file path or URL to get metadata for.
    - keywords: An optional list of keywords to include.
    - language: The language for keyword extraction.
    - max_ngram_size: The maximum n-gram size for keyword extraction.
    - deduplication_threshold: The deduplication threshold for keyword extraction.
    - numOfKeywords: The number of keywords to extract.
    - features: Additional features for keyword extraction.

    Returns:
    - A dictionary containing the metadata for the file.
    """
    if keywords is None:
        keywords = []

    file_category = categorize_file(file_path_or_url=file_path_or_url)
    text_content = convert_doc_to_plain_text(file_path_or_url=file_path_or_url)

    extracted_keywords = extract_keywords_from_text(
        text_content=text_content,
        language=language,
        max_ngram_size=max_ngram_size,
        deduplication_threshold=deduplication_threshold,
        numOfKeywords=numOfKeywords,
        features=features,
    )
    extracted_keywords_only = [text for text, _ in extracted_keywords]

    keywords.extend(extracted_keywords_only)
    keywords = list(set(keywords))
    keywords_all_word_forms = get_all_word_forms(words=keywords)

    file_tags = get_text_tags(
        text_content=text_content, keywords=keywords_all_word_forms
    )
    file_extension = pathlib.Path(file_path_or_url).suffix

    return {
        "file_path_or_url": str(file_path_or_url),
        "file_extension": file_extension,
        "file_category": file_category,
        "file_tags": file_tags,
    }


if __name__ == "__main__":
    print("categorize_file:")
    # Test the function
    print("image.jpg: ", categorize_file("image.jpg"))  # Output: 'media'
    print("website.html: ", categorize_file("website.html"))  # Output: 'links'
    print("document.pdf: ", categorize_file("document.pdf"))  # Output: 'documents'
    print(
        "presentation.pptx: ", categorize_file("presentation.pptx")
    )  # Output: 'documents'
    print(
        "spreadsheet.xlsx: ", categorize_file("spreadsheet.xlsx")
    )  # Output: 'documents'
    print("unknown.xyz: ", categorize_file("unknown.xyz"))  # Output: 'other'
    print(
        "https://www.youtube.com/watch?v=pebwHmibla4&t=378s: ",
        categorize_file("https://www.youtube.com/watch?v=pebwHmibla4&t=378s"),
    )  # Output: 'links'
    print()

    print("get_synonyms:")
    print(get_synonyms(categorize_file("image.jpg")))
    print()

    print("extract_keywords_from_text:")
    print(extract_keywords_from_text("rule"))
    print()

    print("get_all_word_forms:")
    print(get_all_word_forms(words=["rule"]))
    print()

    print("get_file_metadata:")
    pprint.pprint(get_file_metadata("Rules Engine 002.pptx", keywords=["rule"]))
    print()
inflect
nltk
requests
tika
yake
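
The bare package names above (inflect, nltk, requests, tika, yake) are the script's third-party dependencies, i.e. the contents of its requirements.txt. They install with pip in the usual way; note that the tika package also needs a Java runtime available, since it launches a local Apache Tika server, and the script downloads the WordNet corpus via nltk.download("wordnet") on first run.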