Last active
July 3, 2020 14:39
-
-
Save dardanxhymshiti/09ed94da1d7e32d0d1cf0a53adcafe1c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def describe_text(text: str) -> dict:
    """Return a dictionary of simple descriptive statistics about ``text``.

    Words are obtained by deleting every ASCII punctuation character
    (``string.punctuation``) and splitting on runs of whitespace.  Sentences
    are approximated as spans that start with an ASCII capital letter and run
    up to the first ``.``, ``!`` or ``?``.

    Args:
        text: The text to describe.

    Returns:
        A dict with these keys, in this order: character, word, unique-word,
        sentence, newline and punctuation counts; average words per sentence
        and average word length (rounded to two decimals); maximum/minimum
        characters and words per sentence; and four boolean flags (digits,
        characters above code point 255, ``?`` present, ``!`` present).
        When no word or no sentence is found, the affected averages are
        ``0.0`` and the affected maxima/minima are ``0``.
    """
    import re
    import string

    punctuation_marks = set(string.punctuation)

    # split() without an argument collapses repeated whitespace and also
    # splits on newlines/tabs, so blank "words" are never counted.
    stripped = text.translate(str.maketrans('', '', string.punctuation))
    words = stripped.split()

    sentences = re.findall(r'[A-Z][^.!?]*[.!?]', text)
    sentence_char_counts = [len(sentence) for sentence in sentences]
    sentence_word_counts = [len(sentence.split()) for sentence in sentences]

    # Guard the ratios: text without a detectable sentence (or without any
    # word) must not raise ZeroDivisionError.
    if sentences:
        words_per_sentence = round(len(words) / len(sentences), 2)
    else:
        words_per_sentence = 0.0
    if words:
        word_length = round(sum(len(word) for word in words) / len(words), 2)
    else:
        word_length = 0.0

    return {
        'Number of characters': len(text),
        'Number of words': len(words),
        'Number of unique words': len(set(words)),
        'Number of sentences': len(sentences),
        'Number of new lines': text.count('\n'),
        # NOTE: the key is misspelled ("punctuatino"); it is kept unchanged
        # so that existing callers reading this key keep working.
        'Number of punctuatino marks': sum(
            1 for char in text if char in punctuation_marks
        ),
        'Average words per sentence': words_per_sentence,
        'Average word length': word_length,
        'Maximum characters in a sentence': max(sentence_char_counts, default=0),
        'Minimum characters in a sentence': min(sentence_char_counts, default=0),
        'Maximum words in a sentence': max(sentence_word_counts, default=0),
        'Minimum words in a sentence': min(sentence_word_counts, default=0),
        'Contains numbers': any(char.isdigit() for char in text),
        # Test the comparison itself; wrapping it in a list made every
        # element truthy, so the flag was True for any non-empty text.
        'Contains unicode characters': any(ord(char) > 255 for char in text),
        'Contains interrogative sentences': '?' in text,
        'Contains exclamatory sentences': '!' in text,
    }
# Example usage: describe a short sample paragraph.
text = (
    "Even at a school like CSM, the knitwear avenue is still considered a "
    "‘niche’ pursuit. “I still get a lot of patronising remarks about "
    "knitting, mostly from men, who say things like ‘Oh so you just knit "
    "in a circle?’ The reality is that I’m operating intense machinery on "
    "a day-to-day basis. And even if I was knitting in a circle, what’s "
    "wrong with that?”"
)
describe_text(text)
# Output recorded for the call above:
# {
# 'Number of characters': 355,
# 'Number of words': 65,
# 'Number of unique words': 55,
# 'Number of sentences': 4,
# 'Number of new lines': 0,
# 'Number of punctuatino marks': 10,
# 'Average words per sentence': 16.25,
# 'Average word length': 4.32,
# 'Maximum characters in a sentence': 127,
# 'Minimum characters in a sentence': 63,
# 'Maximum words in a sentence': 25,
# 'Minimum words in a sentence': 12,
# 'Contains numbers': False,
# 'Contains unicode characters': True,
# 'Contains interrogative sentences': True,
# 'Contains exclamatory sentences': False
# }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment