Skip to content

Instantly share code, notes, and snippets.

lmeulen /
Last active June 29, 2023 05:28
def remove_dates(text):
text = re.sub("\d{2}[- /.]\d{2}[- /.]\d{,4}", "<DATUM> ", text)
text = re.sub(
"|september|oktober|november|december)([- /.]{,2}(\d{4}|\d{2})){,1})"\
"(?P<n>\D)(?![^<]*>)", "<DATE> ", text)
text = re.sub(
"([- /.]{,2}(\d{4}|\d{2})){,1})(?P<n>\D)(?![^<]*>)", "<DATE> ", text)
return text
lmeulen /
Last active April 27, 2023 16:54
import difflib
import re
def tokenize(s):
    """Split the given text into tokens on runs of whitespace.

    Leading or trailing whitespace is not stripped first, so it shows up
    as an empty string at the start or end of the result, exactly as
    ``re.split`` reports it.

    :param str s: The text to split
    :return: The tokens found in the text
    :rtype: list
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence.
    return re.split(r'\s+', s)
def untokenize(ts):
    """Glue the given tokens back together, separated by single spaces.

    :param ts: The string tokens to join
    :return: The joined text
    :rtype: str
    """
    separator = ' '
    return separator.join(ts)
def equalize(s1, s2):
l1 = tokenize(s1)
lmeulen / summary_class_base
Last active November 12, 2022 10:30
import os
from nltk import tokenize, word_tokenize
from nltk.corpus import stopwords
class Summarizer:
A class used to summarize texts.
This class can summarize texts from strings, lists of strings or a file.
It can use language-specific stop word lists containing words to ignore during the
lmeulen /
Last active November 12, 2022 10:29
def summarize(self, text:str or list, summary_length:int=None) -> str:
Summarize the given text. The text can either be a string or a list of
strings. The string or each element in the list can contain multiple
The language and stop word set have been initialized and are used. If no
summary length is given as parameter, the default length is used.
:param (str or list) text: The text to summarize
:param int summary_length: The length of the summary to generate, optional
:return: A string with the summary of the given text
lmeulen /
Last active November 5, 2022 12:58
from nltk import tokenize, word_tokenize
# Load the stop words: every whitespace-separated token found in
# stopwords.txt becomes one entry of the STOP_WORDS set.
with open("stopwords.txt", "r", encoding="utf-8") as f:
    text = " ".join(f)
STOP_WORDS = {*text.split()}
def summarize(text, no_sentences=3):
for word in word_tokenize(text):
word = word.lower()
lmeulen /
Last active November 5, 2022 12:56
for sent in tokenize.sent_tokenize(text):
sentence_weights[sent] = 0
for word in word_tokenize(sent) :
word = word.lower()
if word in word_weights.keys():
sentence_weights[sent] += word_weights[word]
no_sentences = 3
highest_weights = sorted(sentence_weights.values())[-no_sentences:]
lmeulen /
Created October 31, 2022 18:38
import os
import time
import gzip
import zipfile
import pandas as pd
import geopandas as gpd
import urllib.request
def file_age(filename):
lmeulen /
Created October 28, 2022 15:11
def detect_language(self,text:str) -> str:
Detect the language of the given text and initialize the object
accordingly (setting language and set of stop words)
:param str text: The text to analyse to find the language
:return: The name of the detected language
:rtype: str
ratios = {}
tokens = word_tokenize(text)
lmeulen /
Created October 28, 2022 13:50
def summarize_file(self, filename:str, encoding:str="utf-8", split_at:int=50, summary_length:int=None) -> str:
Summarize the content of the given file. The content of the file is split into
chunks of a given size. The size is specified as number of sentences. For each chunk
a summary is created. These summaries are concatenated to create the summary of the
total contents of the file
The language and stop word set have been initialized and are used. If no
summary length is given as parameter, the default length is used.
:param str filename: The name of the file with the text to summarize
:param str encoding: The encoding of the file, defaults to utf-8
lmeulen /
Created October 24, 2022 10:28
with open("longtext.txt", "r", encoding="utf-8") as f:
text = " ".join(f.readlines())
sentences = []
for sent in tokenize.sent_tokenize():
chunks = [sentences[x:x+50] for x in range(0, len(sentences), 50)]
summary = []
for c in chunks: