Skip to content

Instantly share code, notes, and snippets.

@dardanxhymshiti
Last active July 3, 2020 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dardanxhymshiti/09ed94da1d7e32d0d1cf0a53adcafe1c to your computer and use it in GitHub Desktop.
Save dardanxhymshiti/09ed94da1d7e32d0d1cf0a53adcafe1c to your computer and use it in GitHub Desktop.
def describe_text(text):
    """Compute simple descriptive statistics for a piece of text.

    Words are the whitespace-separated tokens left after stripping ASCII
    punctuation (``string.punctuation``); typographic characters such as
    curly quotes are not stripped. Sentences are spans that start with an
    uppercase ASCII letter and run up to the next ``.``, ``!`` or ``?``.

    Args:
        text: The string to analyse.

    Returns:
        A dict mapping a human-readable statistic name to its value.
        Averages are rounded to two decimals. When the text has no words or
        no detectable sentences, the dependent averages and min/max values
        are reported as 0 instead of raising.
    """
    import re
    import string

    description = dict()

    # Remove (ASCII) punctuation marks.
    text_wo_punctuation_marks = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Tokens of the text without punctuation marks. split() with no argument
    # collapses runs of whitespace, so empty strings are never counted as words.
    tokens = text_wo_punctuation_marks.split()

    # List of sentences.
    pattern = re.compile(r'([A-Z][^\.!?]*[\.!?])', re.M)
    list_of_sentences = pattern.findall(text)

    # Per-sentence character and word counts.
    sentence_character_counts = [len(sentence) for sentence in list_of_sentences]
    sentence_word_counts = [len(sentence.split()) for sentence in list_of_sentences]

    number_of_words = len(tokens)
    number_of_sentences = len(list_of_sentences)

    description['Number of characters'] = len(text)
    description['Number of words'] = number_of_words
    description['Number of unique words'] = len(set(tokens))
    description['Number of sentences'] = number_of_sentences
    description['Number of new lines'] = text.count('\n')
    # NOTE: the key below is misspelled ("punctuatino"); it is kept unchanged
    # so that existing callers reading this key keep working.
    description['Number of punctuatino marks'] = sum(1 for char in text if char in string.punctuation)
    # Guard the divisions: text without sentences/words must not crash.
    description['Average words per sentence'] = (
        round(number_of_words / number_of_sentences, 2) if number_of_sentences else 0.0
    )
    description['Average word length'] = (
        round(sum(len(token) for token in tokens) / number_of_words, 2) if number_of_words else 0.0
    )
    # default=0 covers the "no sentence found" case, where max()/min() would raise.
    description['Maximum characters in a sentence'] = max(sentence_character_counts, default=0)
    description['Minimum characters in a sentence'] = min(sentence_character_counts, default=0)
    description['Maximum words in a sentence'] = max(sentence_word_counts, default=0)
    description['Minimum words in a sentence'] = min(sentence_word_counts, default=0)
    description['Contains numbers'] = any(char.isdigit() for char in text)
    # True only if some code point lies above 255 (outside Latin-1). The test
    # must be a bare boolean: wrapping it in a list makes it always truthy.
    description['Contains unicode characters'] = any(ord(char) > 255 for char in text)
    description['Contains interrogative sentences'] = '?' in text
    description['Contains exclamatory sentences'] = '!' in text
    return description
# Test: run describe_text on a sample passage that contains typographic
# (curly) quotes, hyphenated words and question marks.
# The returned dict is neither printed nor stored here; the expected result
# for this input is listed in the comment block below.
text = """Even at a school like CSM, the knitwear avenue is still considered a ‘niche’ pursuit. “I still get a lot of patronising remarks about knitting, mostly from men, who say things like ‘Oh so you just knit in a circle?’ The reality is that I’m operating intense machinery on a day-to-day basis. And even if I was knitting in a circle, what’s wrong with that?”"""
describe_text(text)
# {
# 'Number of characters': 355,
# 'Number of words': 65,
# 'Number of unique words': 55,
# 'Number of sentences': 4,
# 'Number of new lines': 0,
# 'Number of punctuatino marks': 10,
# 'Average words per sentence': 16.25,
# 'Average word length': 4.32,
# 'Maximum characters in a sentence': 127,
# 'Minimum characters in a sentence': 63,
# 'Maximum words in a sentence': 25,
# 'Minimum words in a sentence': 12,
# 'Contains numbers': False,
# 'Contains unicode characters': True,
# 'Contains interrogative sentences': True,
# 'Contains exclamatory sentences': False
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment