# dardanxhymshiti/describe_text.py

## describe_text.py
def describe_text(text: str) -> dict:
    """Return a dictionary of simple descriptive statistics about ``text``.

    Words are the whitespace-separated tokens that remain after every ASCII
    punctuation character (``string.punctuation``) has been removed.
    Sentences are the non-overlapping matches of: one uppercase ASCII
    letter, any run of characters other than ``.``, ``!`` or ``?``, then one
    of those three terminators.  Text that never starts a span with ``A-Z``
    or never reaches a terminator therefore yields no sentences.

    Args:
        text: The string to analyse.

    Returns:
        A dict keyed by human-readable labels, in a fixed insertion order:
        character, word, unique-word, sentence, newline and punctuation
        counts; two averages rounded to two decimals; per-sentence
        maximum/minimum character and word counts; and four boolean flags.
        When no sentence (or no word) is found, the averages that depend on
        it are ``0.0`` and the per-sentence extremes are ``0`` instead of
        raising.
    """
    import re
    import string

    punctuation = set(string.punctuation)

    # Drop ASCII punctuation, then split on any whitespace run so repeated
    # spaces, tabs and newlines never produce empty "words".
    words = text.translate(str.maketrans('', '', string.punctuation)).split()

    # A sentence starts at an uppercase ASCII letter and ends at the first
    # '.', '!' or '?' that follows it.
    sentences = re.findall(r'([A-Z][^\.!?]*[\.!?])', text)

    sentence_char_counts = [len(sentence) for sentence in sentences]
    # Per-sentence word counts keep punctuation attached to the tokens.
    sentence_word_counts = [len(sentence.split()) for sentence in sentences]

    # Guard the divisions: text without sentences or words is valid input.
    average_words_per_sentence = (
        round(len(words) / len(sentences), 2) if sentences else 0.0
    )
    average_word_length = (
        round(sum(len(word) for word in words) / len(words), 2) if words else 0.0
    )

    return {
        'Number of characters': len(text),
        'Number of words': len(words),
        'Number of unique words': len(set(words)),
        'Number of sentences': len(sentences),
        'Number of new lines': text.count('\n'),
        # NOTE: the misspelled key is kept on purpose; callers may read it.
        'Number of punctuatino marks': sum(1 for char in text if char in punctuation),
        'Average words per sentence': average_words_per_sentence,
        'Average word length': average_word_length,
        'Maximum characters in a sentence': max(sentence_char_counts, default=0),
        'Minimum characters in a sentence': min(sentence_char_counts, default=0),
        'Maximum words in a sentence': max(sentence_word_counts, default=0),
        'Minimum words in a sentence': min(sentence_word_counts, default=0),
        'Contains numbers': any(char.isdigit() for char in text),
        # True only when some code point lies above 255.
        'Contains unicode characters': any(ord(char) > 255 for char in text),
        'Contains interrogative sentences': '?' in text,
        'Contains exclamatory sentences': '!' in text,
    }

# Demo: run describe_text on a short sample passage (the result is discarded).
text = (
    "Even at a school like CSM, the knitwear avenue is still considered a ‘niche’ pursuit. "
    "“I still get a lot of patronising remarks about knitting, mostly from men, "
    "who say things like ‘Oh so you just knit in a circle?’ "
    "The reality is that I’m operating intense machinery on a day-to-day basis. "
    "And even if I was knitting in a circle, what’s wrong with that?”"
)
describe_text(text)

# {
#     'Number of characters': 355,
#     'Number of words': 65,
#     'Number of unique words': 55,
#     'Number of sentences': 4,
#     'Number of new lines': 0,
#     'Number of punctuatino marks': 10,
#     'Average words per sentence': 16.25,
#     'Average word length': 4.32,
#     'Maximum characters in a sentence': 127,
#     'Minimum characters in a sentence': 63,
#     'Maximum words in a sentence': 25,
#     'Minimum words in a sentence': 12,
#     'Contains numbers': False,
#     'Contains unicode characters': True,
#     'Contains interrogative sentences': True,
#     'Contains exclamatory sentences': False
# }
	def describe_text(text: str) -> dict:
		"""Return a dictionary of simple descriptive statistics about ``text``.

		Words are the whitespace-separated tokens that remain after every ASCII
		punctuation character (``string.punctuation``) has been removed.
		Sentences are the non-overlapping matches of: one uppercase ASCII
		letter, any run of characters other than ``.``, ``!`` or ``?``, then one
		of those three terminators.  Text that never starts a span with ``A-Z``
		or never reaches a terminator therefore yields no sentences.

		Args:
			text: The string to analyse.

		Returns:
			A dict keyed by human-readable labels, in a fixed insertion order:
			character, word, unique-word, sentence, newline and punctuation
			counts; two averages rounded to two decimals; per-sentence
			maximum/minimum character and word counts; and four boolean flags.
			When no sentence (or no word) is found, the averages that depend on
			it are ``0.0`` and the per-sentence extremes are ``0`` instead of
			raising.
		"""
		import re
		import string

		punctuation = set(string.punctuation)

		# Drop ASCII punctuation, then split on any whitespace run so repeated
		# spaces, tabs and newlines never produce empty "words".
		words = text.translate(str.maketrans('', '', string.punctuation)).split()

		# A sentence starts at an uppercase ASCII letter and ends at the first
		# '.', '!' or '?' that follows it.
		sentences = re.findall(r'([A-Z][^\.!?]*[\.!?])', text)

		sentence_char_counts = [len(sentence) for sentence in sentences]
		# Per-sentence word counts keep punctuation attached to the tokens.
		sentence_word_counts = [len(sentence.split()) for sentence in sentences]

		# Guard the divisions: text without sentences or words is valid input.
		average_words_per_sentence = (
			round(len(words) / len(sentences), 2) if sentences else 0.0
		)
		average_word_length = (
			round(sum(len(word) for word in words) / len(words), 2) if words else 0.0
		)

		return {
			'Number of characters': len(text),
			'Number of words': len(words),
			'Number of unique words': len(set(words)),
			'Number of sentences': len(sentences),
			'Number of new lines': text.count('\n'),
			# NOTE: the misspelled key is kept on purpose; callers may read it.
			'Number of punctuatino marks': sum(1 for char in text if char in punctuation),
			'Average words per sentence': average_words_per_sentence,
			'Average word length': average_word_length,
			'Maximum characters in a sentence': max(sentence_char_counts, default=0),
			'Minimum characters in a sentence': min(sentence_char_counts, default=0),
			'Maximum words in a sentence': max(sentence_word_counts, default=0),
			'Minimum words in a sentence': min(sentence_word_counts, default=0),
			'Contains numbers': any(char.isdigit() for char in text),
			# True only when some code point lies above 255.
			'Contains unicode characters': any(ord(char) > 255 for char in text),
			'Contains interrogative sentences': '?' in text,
			'Contains exclamatory sentences': '!' in text,
		}

	# Demo: run describe_text on a short sample passage (the result is discarded).
	text = (
		"Even at a school like CSM, the knitwear avenue is still considered a ‘niche’ pursuit. "
		"“I still get a lot of patronising remarks about knitting, mostly from men, "
		"who say things like ‘Oh so you just knit in a circle?’ "
		"The reality is that I’m operating intense machinery on a day-to-day basis. "
		"And even if I was knitting in a circle, what’s wrong with that?”"
	)
	describe_text(text)

	# {
	# 'Number of characters': 355,
	# 'Number of words': 65,
	# 'Number of unique words': 55,
	# 'Number of sentences': 4,
	# 'Number of new lines': 0,
	# 'Number of punctuatino marks': 10,
	# 'Average words per sentence': 16.25,
	# 'Average word length': 4.32,
	# 'Maximum characters in a sentence': 127,
	# 'Minimum characters in a sentence': 63,
	# 'Maximum words in a sentence': 25,
	# 'Minimum words in a sentence': 12,
	# 'Contains numbers': False,
	# 'Contains unicode characters': True,
	# 'Contains interrogative sentences': True,
	# 'Contains exclamatory sentences': False
	# }