This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import gzip | |
import zipfile | |
import pandas as pd | |
import geopandas as gpd | |
import urllib.request | |
def file_age(filename): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def detect_language(self,text:str) -> str: | |
""" | |
Detect the language of the given text and initialize the object | |
accordingly (setting language and set of stop words) | |
:param str text: The text to analyse to find the language
:return: The name of the detected language | |
:rtype: str | |
""" | |
ratios = {} | |
tokens = word_tokenize(text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def summarize_file(self, filename:str, encoding:str="utf-8", split_at:int=50, summary_length:int=None) -> str: | |
""" | |
Summarize the content of the given file. The content of the file is split into
chunks of a given size. The size is specified as number of sentences. For each chunk | |
a summary is created. These summaries are concatenated to create the summary of the | |
total contents of the file | |
The language and stop word set have been initialized and are used. If no | |
summary length is given as parameter, the default length is used. | |
:param str filename: The name of the file with the text to summarize | |
:param str encoding: The encoding of the file, defaults to utf-8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def summarize(self, text:str or list, summary_length:int=None) -> str: | |
""" | |
Summarize the given text. The text can either be a string or a list of | |
strings. The string or each element in the list can contain multiple | |
sentences. | |
The language and stop word set have been initialized and are used. If no | |
summary length is given as parameter, the default length is used. | |
:param (str or list) text: The text to summarize | |
:param int summary_length: The length of the summary to generate, optional | |
:return: A string with the summary of the given text |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from nltk import tokenize, word_tokenize | |
from nltk.corpus import stopwords | |
class Summarizer: | |
""" | |
A class used to summarize texts. | |
This class can summarize texts from strings, list of string or a file. | |
It can use language specific stop word lists containing words to ignore during the
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the whole input file as one string; newlines inside readlines()
# already separate the joined parts, and the extra spaces are harmless
# to the sentence tokenizer.
with open("longtext.txt", "r", encoding="utf-8") as f:
    text = " ".join(f.readlines())

# BUG FIX: sent_tokenize() was called without an argument, which raises
# a TypeError — it must be given the text to split into sentences.
# sent_tokenize already returns a list, so no append loop is needed.
sentences = tokenize.sent_tokenize(text)

# Group the sentences into chunks of 50 so each chunk can be
# summarized independently and the partial summaries concatenated.
chunks = [sentences[x:x + 50] for x in range(0, len(sentences), 50)]
summary = []
for c in chunks: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import tokenize, word_tokenize | |
# Load the stop word list once at module import time: STOP_WORDS is the
# set of whitespace-separated tokens found in stopwords.txt.
with open("stopwords.txt", "r", encoding="utf-8") as f:
    STOP_WORDS = set(f.read().split())
def summarize(text, no_sentences=3): | |
word_weights={} | |
for word in word_tokenize(text): | |
word = word.lower() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the sentences whose weight made it into the top-N weight
# list, join them with single spaces, and normalise the result:
# underscores (introduced upstream) become spaces, and surrounding
# whitespace is trimmed.
selected = [sent for sent, strength in sentence_weights.items()
            if strength in highest_weights]
summary = " ".join(selected).replace('_', ' ').strip()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Score every sentence as the sum of the weights of its lower-cased
# words; words absent from word_weights contribute nothing.
sentence_weights = {}
for sent in tokenize.sent_tokenize(text):
    score = 0
    for token in word_tokenize(sent):
        score += word_weights.get(token.lower(), 0)
    sentence_weights[sent] = score
no_sentences = 3
# The N largest sentence scores; used to pick the summary sentences.
highest_weights = sorted(sentence_weights.values())[-no_sentences:]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
text = '...............'

# Build a frequency table of the meaningful words in the text:
# single-character tokens and stop words are ignored, everything else
# is counted case-insensitively.
word_weights = {}
for token in word_tokenize(text):
    token = token.lower()
    if len(token) > 1 and token not in STOP_WORDS:
        word_weights[token] = word_weights.get(token, 0) + 1
NewerOlder