Skip to content

Instantly share code, notes, and snippets.

@andreasvc
Last active Oct 3, 2019
Embed
What would you like to do?
Apply polyglot language detection recursively
"""Apply polyglot language detection to all .txt files under current directory
(searched recursively), and write a report to the tab-separated file
detectedlangs.tsv.
"""
import os
from glob import glob
from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage
def _strip_unprintable(text):
    """Return *text* reduced to printable characters for the detector.

    Whitespace that ``str.isprintable`` rejects (newlines, tabs, no-break
    spaces, ...) is turned into a plain space rather than dropped, so the
    last word of one line is not glued to the first word of the next.
    Every other non-printable character is deleted.
    """
    return ''.join(
        char if char.isprintable() else ' '
        for char in text
        if char.isprintable() or char.isspace())


def main():
    """Detect the language of every .txt file and write a TSV report.

    Searches the current directory recursively for ``*.txt`` files, runs
    polyglot's ``Detector`` on the filtered contents of each one, and
    writes one row per file to ``detectedlangs.tsv`` (UTF-8, tab-separated)
    with the columns ``filename``, ``lang``, ``confidence`` and
    ``read_bytes``.

    When the detector raises ``UnknownLanguage`` the row holds ``unknown``,
    a confidence of ``0`` and the length of the decoded text in characters.
    """
    with open('detectedlangs.tsv', 'w', encoding='utf8') as out:
        print(
            'filename', 'lang', 'confidence', 'read_bytes',
            sep='\t', file=out)
        for fname in glob('**/*.txt', recursive=True):
            # errors='replace' keeps a single file that is not valid UTF-8
            # from aborting the whole run and leaving a partial report.
            with open(fname, encoding='utf8', errors='replace') as inp:
                text = inp.read()
            # Only printable characters are handed to the detector;
            # workaround taken from:
            # https://github.com/aboSamoor/polyglot/issues/71#issuecomment-445199949
            filteredtext = _strip_unprintable(text)
            try:
                res = Detector(filteredtext)
            except UnknownLanguage:
                print(
                    fname,
                    'unknown',
                    0,
                    len(text),
                    sep='\t', file=out)
            else:
                print(
                    fname,
                    res.language.code,
                    res.language.confidence,
                    res.language.read_bytes,
                    sep='\t', file=out)
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment