Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active June 11, 2019 17:08
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save edsu/b8cb35df0cb80723c414230a9d2abb01 to your computer and use it in GitHub Desktop.
Save edsu/b8cb35df0cb80723c414230a9d2abb01 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import csv
from xml.etree import ElementTree
from langdetect import detect_langs
from requests_html import HTMLSession
# Single HTML-capable session shared by every request this script makes.
http = HTMLSession()
def langs(url):
    """Return the languages detected in the main section of the page at *url*.

    The page is fetched with the module-level ``http`` session and the text
    of its first ``section.main`` element is handed to ``detect_langs``.

    Returns a dict mapping each detected language's ``lang`` value to its
    ``prob`` value. An empty dict is returned when the page has no
    ``section.main`` element, when that element holds no text, or when
    langdetect finds no usable features in the text, so one odd page does
    not abort a whole crawl.
    """
    # Local import keeps this function self-contained; langdetect is already
    # a dependency of the file.
    from langdetect.lang_detect_exception import LangDetectException

    resp = http.get(url)
    main = resp.html.find('section.main', first=True)
    # find(..., first=True) gives None when the selector matches nothing.
    if main is None:
        return {}

    text = main.text
    if not text or not text.strip():
        return {}

    try:
        guesses = detect_langs(text)
    except LangDetectException:
        # Raised e.g. for text made only of digits or punctuation.
        return {}
    return {guess.lang: guess.prob for guess in guesses}
def urls():
    """Yield every article URL listed in the usbeketrica.com sitemap.

    The sitemap is fetched with the module-level ``http`` session, parsed as
    XML, and each ``loc`` element in the sitemap 0.84 namespace is inspected.
    Only URLs starting with ``https://usbeketrica.com/article/`` are yielded,
    with surrounding whitespace removed.
    """
    resp = http.get('https://usbeketrica.com/sitemap.xml')
    # Parse the raw bytes so the XML parser applies the document's own
    # encoding declaration instead of the HTTP client's charset guess.
    doc = ElementTree.fromstring(resp.content)
    for loc in doc.findall('.//{http://www.google.com/schemas/sitemap/0.84}loc'):
        # An empty <loc/> has text None; pretty-printed sitemaps may pad the
        # URL with whitespace or newlines.
        address = (loc.text or '').strip()
        if address.startswith('https://usbeketrica.com/article/'):
            yield address
# Write one CSV row per article: its URL and the detected probabilities for
# French and English (0 when a language was not detected).
# The context manager guarantees the file is flushed and closed even if a
# request fails midway; newline='' is what the csv module requires so that no
# extra blank rows appear on platforms that translate line endings.
with open('langs.csv', 'w', newline='') as csv_file:
    output = csv.writer(csv_file)
    output.writerow(['url', 'fr', 'en'])
    for url in urls():
        results = langs(url)
        print(url, results)
        output.writerow([url, results.get('fr', 0), results.get('en', 0)])
@yohannawaliya
Copy link

yohannawaliya commented Jun 10, 2019

it is working for me

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment