Skip to content

Instantly share code, notes, and snippets.

@lmeulen
lmeulen / easy_eurostat.py
Created October 31, 2022 18:38
easy_eurostat
import os
import time
import gzip
import zipfile
import pandas as pd
import geopandas as gpd
import urllib.request
def file_age(filename):
@lmeulen
lmeulen / summary_class_autodetect.py
Created October 28, 2022 15:11
summary_class_autodetect
def detect_language(self,text:str) -> str:
"""
Detect the language of the given text and initialize the object
accordingly (setting language and set of stop words)
:param str text: The text to analyse to find the language
:return: The name of the detected language
:rtype: str
"""
ratios = {}
tokens = word_tokenize(text)
@lmeulen
lmeulen / summary_class_file.py
Created October 28, 2022 13:50
summary_class_file
def summarize_file(self, filename:str, encoding:str="utf-8", split_at:int=50, summary_length:int=None) -> str:
"""
Summarize the content of the given file. The content of the file is split into
chunks of a given size. The size is specified as number of sentences. For each chunk
a summary is created. These summaries are concatenated to create the summary of the
total contents of the file
The language and stop word set have been initialized and are used. If no
summary length is given as parameter, the default length is used.
:param str filename: The name of the file with the text to summarize
:param str encoding: The encoding of the file, defaults to utf-8
@lmeulen
lmeulen / summary_class_summary.py
Last active November 12, 2022 10:29
summary_class_summary
def summarize(self, text:str or list, summary_length:int=None) -> str:
"""
Summarize the given text. The text can either be a string or a list of
strings. The string or each element in the list can contain multiple
sentences.
The language and stop word set have been initialized and are used. If no
summary length is given as parameter, the default length is used.
:param (str or list) text: The text to summarize
:param int summary_length: The length of the summary to generate, optional
:return: A string with the summary of the given text
@lmeulen
lmeulen / summary_class_base
Last active November 12, 2022 10:30
summary_class_base
import os
from nltk import tokenize, word_tokenize
from nltk.corpus import stopwords
class Summarizer:
"""
A class used to summarize texts.
This class can summarize texts from strings, list of string or a file.
It can use language specific stop word lists containing words to ignore during the
@lmeulen
lmeulen / text_summary_large.py
Created October 24, 2022 10:28
text_summary_large
# Read the full text and split it into fixed-size chunks of sentences so that
# each chunk can be summarized independently.
with open("longtext.txt", "r", encoding="utf-8") as f:
    text = " ".join(f.readlines())

# BUG FIX: sent_tokenize was called without the text argument, which raises a
# TypeError. It already returns a list of sentences, so the manual append loop
# is unnecessary as well.
sentences = tokenize.sent_tokenize(text)

# Group the sentences into chunks of 50; the last chunk may be shorter.
chunks = [sentences[x:x + 50] for x in range(0, len(sentences), 50)]
summary = []
for c in chunks:
@lmeulen
lmeulen / text_summary_function.py
Last active November 5, 2022 12:58
text_summary_function
from nltk import tokenize, word_tokenize
with open("stopwords.txt", "r", encoding="utf-8") as f:
text = " ".join(f.readlines())
STOP_WORDS = set(text.split())
def summarize(text, no_sentences=3):
word_weights={}
for word in word_tokenize(text):
word = word.lower()
@lmeulen
lmeulen / text_summary_combine.py
Created October 23, 2022 14:38
text_summary_combine
# Build the summary by keeping only the sentences whose weight made it into
# the top-N weight list, preserving their original order of appearance.
selected = (
    sentence
    for sentence, weight in sentence_weights.items()
    if weight in highest_weights
)
summary = " ".join(selected) + " "
# Underscores may remain from tokenization artifacts; restore spaces and trim
# surrounding whitespace.
summary = summary.replace('_', ' ').strip()
@lmeulen
lmeulen / text_summary_sentence_weight.py
Last active November 5, 2022 12:56
text_summary_sentence_weight
# Score every sentence as the sum of the weights of the words it contains.
# Words that are absent from word_weights (stop words, short tokens)
# contribute zero via dict.get.
sentence_weights = {
    sentence: sum(
        word_weights.get(token.lower(), 0)
        for token in word_tokenize(sentence)
    )
    for sentence in tokenize.sent_tokenize(text)
}

# Keep the no_sentences largest weights; sentences whose weight appears in
# this list are the ones selected for the summary.
no_sentences = 3
highest_weights = sorted(sentence_weights.values())[-no_sentences:]
@lmeulen
lmeulen / text_summary_word_count.py
Last active October 23, 2022 13:56
text_summary_word_count
# Placeholder input text; replaced with real content in practice.
text = '...............'

# Count how often each meaningful word occurs. Tokens are lower-cased first;
# single-character tokens (punctuation, etc.) and stop words are ignored.
word_weights = {}
for word in word_tokenize(text):
    word = word.lower()
    if len(word) > 1 and word not in STOP_WORDS:
        # Idiom fix: dict.get with a default replaces the
        # `if word in word_weights.keys()` membership test plus the manual
        # if/else increment — one lookup instead of two, same result.
        word_weights[word] = word_weights.get(word, 0) + 1