rraallvv/cambridge_dictionary.py

## cambridge_dictionary.py
#!/usr/bin/python

import requests
from lxml import html
import re
from random import randint
from time import sleep
import sys
import random
import fake_useragent
from fake_useragent import UserAgent

# Root of the English dictionary pages; the looked-up word is appended to it.
base_url = 'http://dictionary.cambridge.org/dictionary/english/'

# HTTP proxies ("host:port") that requests are rotated through.
proxy_list = [
	'183.111.169.207:3128',
	'54.157.185.100:10000',
	'185.92.220.84:3128',
	'91.221.61.126:3128',
	'161.68.250.139:80',
	'183.111.169.203:3128',
	'200.35.187.114:8080',
	'208.85.182.130:8080',
	'52.33.65.51:80',
	'207.28.38.2:3128',
	'12.207.13.20:3128',
	'173.220.170.242:7004',
	'69.143.93.173:3128',
	'107.170.232.185:80',
	'119.52.160.2:3128',
	'124.192.17.123:3128',
	'107.170.232.185:8080',
	'1.234.45.50:3128',
	'118.26.143.202:3128',
	'92.46.122.98:3128',
	'221.199.203.70:3128',
	'107.151.152.218:80',
	'60.191.153.12:3128',
	'118.144.187.254:3128',
	'118.163.108.104:3128',
]

# Index into proxy_list of the proxy the next request will use.
current_proxy = 0

# Request headers; the 'user-agent' entry is overwritten before each request.
header_dict = {
	'user-agent': (
		'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) '
		'Gecko/20100101 Firefox/32.0'
	),
}

#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined

def get_transcription(word, timeout=10):
	"""Look up the American English pronunciations of *word*.

	The dictionary page for the word is fetched through the next proxy in
	``proxy_list`` with a random user agent. Each entry of the page's
	``ds-american-english`` tab that has a pronunciation contributes one item
	of the form ``/ipa, ipa/ pos, pos<TAB>audio_url``; entries sharing the
	same transcription are merged. Items are joined with ``" | "``.

	A failed request, an empty response body, or a page without the American
	tab moves on to the next proxy and retries, with no upper bound on the
	number of attempts.

	word    -- the word to look up.
	timeout -- seconds to wait on a proxy before giving up on it and
	           rotating to the next one.

	Returns the joined string. It is empty when the lookup is interrupted
	with Ctrl-C or when no entry on the page carries a pronunciation.
	"""
	global current_proxy
	# Built once per call rather than once per retry.
	user_agents = UserAgent()
	while True:
		# Only plain-HTTP traffic is routed through the proxy.
		proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
		header_dict['user-agent'] = user_agents.random
		current_proxy = (current_proxy + 1) % len(proxy_list)
		url = base_url + word.lower().replace("'", '-')
		try:
			# params= lets requests URL-encode the query value.
			response = requests.get(url, params={'q': word}, proxies=proxy_dict, headers=header_dict, timeout=timeout)
		except KeyboardInterrupt:
			return ""
		except Exception:
			# Best effort: whatever went wrong with this proxy, try the next.
			print('\033[91m' + "Connection error with " + proxy_dict['http'] + " " + header_dict['user-agent'] + '\033[0m')
			continue

		# lxml refuses to parse an empty document, so treat it as a bad proxy.
		if not response.content.strip():
			continue
		tree = html.fromstring(response.content)

		american_tabs = tree.xpath("//div[@data-tab='ds-american-english']")
		if not american_tabs:
			continue
		american = american_tabs[0]

		# Each item is [transcription, part-of-speech tags, audio URL].
		transcribed_words = []
		for alternative in american.xpath(".//div[contains(@class, 'entry-body__el')]"):
			pron_nodes = alternative.xpath(".//span[@class='pron']")
			if not pron_nodes:
				continue
			ipa_parts = pron_nodes[0].xpath(".//span[contains(@class, 'ipa')]/text()")
			transcription = "/" + ", ".join(ipa_parts) + "/"

			# An entry may have a pronunciation but no audio clip.
			audio_urls = alternative.xpath(".//span[@data-src-mp3]/@data-src-mp3")
			audio_url = audio_urls[0] if audio_urls else ''

			posgram_nodes = alternative.xpath(".//span[contains(@class, 'posgram')]")
			if posgram_nodes:
				transcription_types = posgram_nodes[0].xpath(".//span[@class='pos']/text()")
			else:
				transcription_types = []

			# Reuse the item with the same transcription, or start a new one.
			for transcribed_word in transcribed_words:
				if transcribed_word[0] == transcription:
					pos_tags = transcribed_word[1]
					break
			else:
				pos_tags = []
				transcribed_words.append([transcription, pos_tags, audio_url])
			# Add unseen tags in page order so the output is deterministic.
			for pos in transcription_types:
				if pos not in pos_tags:
					pos_tags.append(pos)

		return " | ".join(
			item[0] + ' ' + ", ".join(item[1]) + '\t' + item[2]
			for item in transcribed_words
		)

# Start each run at a different point in the proxy rotation.
random.shuffle(proxy_list)

# Print one "word<TAB>transcriptions" line per word in words-list.txt.
with open('words-list.txt') as lines:
	for word in lines:
		# Drop the line ending (including a CR from CRLF files) and padding.
		word = word.strip()
		if not word:
			# A blank line would request the bare dictionary URL.
			continue
		transcription = get_transcription(word)
		if not transcription:
			# An empty result (e.g. after Ctrl-C) ends the run.
			break
		print(word + '\t' + transcription)
	#!/usr/bin/python

	import requests
	from lxml import html
	import re
	from random import randint
	from time import sleep
	import sys
	import random
	import fake_useragent
	from fake_useragent import UserAgent

	# Root of the English dictionary pages; the looked-up word is appended to it.
	base_url = 'http://dictionary.cambridge.org/dictionary/english/'

	# HTTP proxies ("host:port") that requests are rotated through.
	proxy_list = [
		'183.111.169.207:3128',
		'54.157.185.100:10000',
		'185.92.220.84:3128',
		'91.221.61.126:3128',
		'161.68.250.139:80',
		'183.111.169.203:3128',
		'200.35.187.114:8080',
		'208.85.182.130:8080',
		'52.33.65.51:80',
		'207.28.38.2:3128',
		'12.207.13.20:3128',
		'173.220.170.242:7004',
		'69.143.93.173:3128',
		'107.170.232.185:80',
		'119.52.160.2:3128',
		'124.192.17.123:3128',
		'107.170.232.185:8080',
		'1.234.45.50:3128',
		'118.26.143.202:3128',
		'92.46.122.98:3128',
		'221.199.203.70:3128',
		'107.151.152.218:80',
		'60.191.153.12:3128',
		'118.144.187.254:3128',
		'118.163.108.104:3128',
	]

	# Index into proxy_list of the proxy the next request will use.
	current_proxy = 0

	# Request headers; the 'user-agent' entry is overwritten before each request.
	header_dict = {
		'user-agent': (
			'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) '
			'Gecko/20100101 Firefox/32.0'
		),
	}

	#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined

	def get_transcription(word, timeout=10):
		"""Look up the American English pronunciations of *word*.

		The dictionary page for the word is fetched through the next proxy in
		``proxy_list`` with a random user agent. Each entry of the page's
		``ds-american-english`` tab that has a pronunciation contributes one
		item of the form ``/ipa, ipa/ pos, pos<TAB>audio_url``; entries sharing
		the same transcription are merged. Items are joined with ``" | "``.

		A failed request, an empty response body, or a page without the
		American tab moves on to the next proxy and retries, with no upper
		bound on the number of attempts.

		word    -- the word to look up.
		timeout -- seconds to wait on a proxy before giving up on it and
		           rotating to the next one.

		Returns the joined string. It is empty when the lookup is interrupted
		with Ctrl-C or when no entry on the page carries a pronunciation.
		"""
		global current_proxy
		# Built once per call rather than once per retry.
		user_agents = UserAgent()
		while True:
			# Only plain-HTTP traffic is routed through the proxy.
			proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
			header_dict['user-agent'] = user_agents.random
			current_proxy = (current_proxy + 1) % len(proxy_list)
			url = base_url + word.lower().replace("'", '-')
			try:
				# params= lets requests URL-encode the query value.
				response = requests.get(url, params={'q': word}, proxies=proxy_dict, headers=header_dict, timeout=timeout)
			except KeyboardInterrupt:
				return ""
			except Exception:
				# Best effort: whatever went wrong with this proxy, try the next.
				print('\033[91m' + "Connection error with " + proxy_dict['http'] + " " + header_dict['user-agent'] + '\033[0m')
				continue

			# lxml refuses to parse an empty document, so treat it as a bad proxy.
			if not response.content.strip():
				continue
			tree = html.fromstring(response.content)

			american_tabs = tree.xpath("//div[@data-tab='ds-american-english']")
			if not american_tabs:
				continue
			american = american_tabs[0]

			# Each item is [transcription, part-of-speech tags, audio URL].
			transcribed_words = []
			for alternative in american.xpath(".//div[contains(@class, 'entry-body__el')]"):
				pron_nodes = alternative.xpath(".//span[@class='pron']")
				if not pron_nodes:
					continue
				ipa_parts = pron_nodes[0].xpath(".//span[contains(@class, 'ipa')]/text()")
				transcription = "/" + ", ".join(ipa_parts) + "/"

				# An entry may have a pronunciation but no audio clip.
				audio_urls = alternative.xpath(".//span[@data-src-mp3]/@data-src-mp3")
				audio_url = audio_urls[0] if audio_urls else ''

				posgram_nodes = alternative.xpath(".//span[contains(@class, 'posgram')]")
				if posgram_nodes:
					transcription_types = posgram_nodes[0].xpath(".//span[@class='pos']/text()")
				else:
					transcription_types = []

				# Reuse the item with the same transcription, or start a new one.
				for transcribed_word in transcribed_words:
					if transcribed_word[0] == transcription:
						pos_tags = transcribed_word[1]
						break
				else:
					pos_tags = []
					transcribed_words.append([transcription, pos_tags, audio_url])
				# Add unseen tags in page order so the output is deterministic.
				for pos in transcription_types:
					if pos not in pos_tags:
						pos_tags.append(pos)

			# Plain " | " separator; the backslash before the pipe was a
			# markdown-escape artifact, not part of the intended output.
			return " | ".join(
				item[0] + ' ' + ", ".join(item[1]) + '\t' + item[2]
				for item in transcribed_words
			)

	# Start each run at a different point in the proxy rotation.
	random.shuffle(proxy_list)

	# Print one "word<TAB>transcriptions" line per word in words-list.txt.
	with open('words-list.txt') as lines:
		for word in lines:
			# Drop the line ending (including a CR from CRLF files) and padding.
			word = word.strip()
			if not word:
				# A blank line would request the bare dictionary URL.
				continue
			transcription = get_transcription(word)
			if not transcription:
				# An empty result (e.g. after Ctrl-C) ends the run.
				break
			print(word + '\t' + transcription)