Skip to content

Instantly share code, notes, and snippets.

@dkavraal
dkavraal / türkçe.py
Last active January 3, 2020 07:46
Detects whether a Turkish text is encoded in cp1254 (Windows-1254) or UTF-8.
"""
Letter Codes
------------------------
LETTER  Windows-1254  ISO-8859-9  latin5   UTF-8
ğ       b'\xf0'       b'\xf0'     b'\xf0'  b'\xc4\x9f'
ı       b'\xfd'       b'\xfd'     b'\xfd'  b'\xc4\xb1'
ş       b'\xfe'       b'\xfe'     b'\xfe'  b'\xc5\x9f'
ü       b'\xfc'       b'\xfc'     b'\xfc'  b'\xc3\xbc'
ö       b'\xf6'       b'\xf6'     b'\xf6'  b'\xc3\xb6'
"""
import sys
import numpy
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
import nltk.corpus
from nltk import decorators
import nltk.stem
# English stop-word list from the NLTK stopwords corpus, materialized as a
# set.
stopwords = {*nltk.corpus.stopwords.words('english')}
# Bound ``stem`` method of one Snowball English stemmer instance, exposed as
# a plain callable.
stemmer_func = nltk.stem.snowball.EnglishStemmer().stem