Skip to content

Instantly share code, notes, and snippets.

@muffincode
Created October 15, 2018 14:17
Show Gist options
  • Save muffincode/ddf384aac93c29db8268578c4fb8b699 to your computer and use it in GitHub Desktop.
Save muffincode/ddf384aac93c29db8268578c4fb8b699 to your computer and use it in GitHub Desktop.
Get language and location for webpages
import urllib.request
import html2text
import nltk
from langdetect import detect
from geotext import GeoText
# Browser-like request headers: some sites refuse requests that do not
# look like they come from a regular browser (source: Stack Overflow).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 '
        '(KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
    ),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    # Ask for an uncompressed body so the bytes can be decoded directly.
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
# Read URLs from websites.csv (one URL per line). For each page, print one
# row holding the URL, the detected language code and the country names
# found in the page text. A page that cannot be fetched or analysed yields
# a "<url> ,error,error" row instead.
with open("websites.csv", "r") as url_file:
    for line in url_file:
        url = line.rstrip()
        if not url:
            # Blank lines (e.g. a trailing newline) are not URLs; skip them
            # rather than reporting them as failures.
            continue
        try:
            # URL + Request
            req = urllib.request.Request(url, None, headers)
            with urllib.request.urlopen(req) as response:
                contents = response.read()
                # Decode with the charset the server declared; fall back to
                # UTF-8 when the Content-Type header names none.
                charset = response.headers.get_content_charset() or "utf-8"
            # Get content from HTML. Undecodable bytes are replaced so one
            # stray byte does not discard an otherwise readable page.
            text = html2text.html2text(contents.decode(charset, errors="replace"))
            # Get locale
            locale = detect(text)
            # Get location: de-duplicate the countries, sorted so the
            # output is stable from run to run.
            places = GeoText(text)
            locations = sorted(set(places.countries))
            locations_string = ','.join(locations)
            print(url,',',locale,',"',locations_string,'"')
        except Exception:
            # Report the URL that failed (not the Request object, which may
            # not even exist if building the request was what failed).
            print(url,',error,error')
            # NOTE(review): the script stops at the first failing URL, as
            # the original did; drop this `break` to keep processing the
            # remaining URLs.
            break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment