Skip to content

Instantly share code, notes, and snippets.

@yohannawaliya
Forked from edsu/langdetect_test.py
Created June 11, 2019 17:08
Show Gist options
  • Save yohannawaliya/82b7162bc0f927d7086bd07ec7c50d69 to your computer and use it in GitHub Desktop.
Save yohannawaliya/82b7162bc0f927d7086bd07ec7c50d69 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import csv
from xml.etree import ElementTree
from langdetect import detect_langs
from requests_html import HTMLSession
# Shared requests_html session; langs() and urls() both fetch through it.
http = HTMLSession()
def langs(url):
    """Detect the languages used in the main section of the page at *url*.

    The page is fetched with the shared ``http`` session, the first element
    matching the CSS selector ``section.main`` is located, and its text is
    passed to ``detect_langs``.

    Returns:
        A dict mapping each detected language code to its probability.
        The dict is empty when the page has no ``section.main`` element,
        when that element contains no text, or when langdetect cannot
        extract any features from the text.
    """
    # Local import: langdetect is already a dependency of this file, and
    # importing here keeps this function self-contained.
    from langdetect.lang_detect_exception import LangDetectException

    resp = http.get(url)
    main = resp.html.find('section.main', first=True)
    # find(..., first=True) gives None when nothing matches the selector;
    # without this guard one odd page would abort the whole crawl.
    if main is None:
        return {}
    text = main.text
    if not text or not text.strip():
        return {}
    try:
        detected = detect_langs(text)
    except LangDetectException:
        # langdetect raises when the text has no usable features
        # (for example only digits or punctuation).
        return {}
    return {l.lang: l.prob for l in detected}
def urls():
    """Yield the article URLs listed in the usbeketrica.com sitemap.

    The sitemap is downloaded with the shared ``http`` session and every
    ``loc`` element in the ``http://www.google.com/schemas/sitemap/0.84``
    namespace is inspected. Only locations that start with
    ``https://usbeketrica.com/article/`` are yielded.

    Yields:
        str: one article URL per matching ``loc`` element, with surrounding
        whitespace removed.
    """
    sitemap = http.get('https://usbeketrica.com/sitemap.xml')
    # Parse the raw bytes so the XML parser honours the encoding declared in
    # the document instead of the charset guessed for ``.text``.
    doc = ElementTree.fromstring(sitemap.content)
    for loc in doc.iterfind('.//{http://www.google.com/schemas/sitemap/0.84}loc'):
        # An empty <loc/> element has ``text`` set to None; treat it as blank.
        link = (loc.text or '').strip()
        if link.startswith('https://usbeketrica.com/article/'):
            yield link
# Write one CSV row per article URL with the detected probabilities for
# French and English (0 when a language was not detected).
# The file is opened in a ``with`` block so it is flushed and closed even if
# a request fails part-way through; newline='' is what the csv module
# requires to avoid spurious blank lines on some platforms.
with open('langs.csv', 'w', newline='', encoding='utf-8') as csv_file:
    output = csv.writer(csv_file)
    output.writerow(['url', 'fr', 'en'])
    for url in urls():
        results = langs(url)
        print(url, results)
        output.writerow([url, results.get('fr', 0), results.get('en', 0)])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment