# rraallvv/lingorado_ipa_transcription.py

## lingorado_ipa_transcription.py
#!/usr/bin/python

import requests
from lxml import html
import re
from random import randint
from time import sleep
import sys
import random

# URL that the transcription form is POSTed to.
base_url = 'http://lingorado.com/ipa/'

# HTTP proxies, as "host:port" strings, that requests are rotated through.
proxy_list = [
	'89.187.217.114:80',
	'92.126.152.221:8080',
	'35.188.62.145:80',
	'171.255.199.3:3128',
	'178.62.91.24:8118',
	'203.74.4.4:80',
	'61.69.92.106:80',
	'94.177.180.226:80',
	'98.182.126.155:80',
	'203.74.4.0:80',
	'51.254.16.106:8080',
	'219.76.4.72:88',
	'203.74.4.6:80',
	'125.16.128.118:3128',
	'217.15.85.202:8080',
	'24.249.80.22:3128',
	'34.252.130.88:8080',
	'62.214.70.116:3128',
	'89.187.217.116:80',
	'219.76.4.12:88',
]

# Index into proxy_list of the proxy to use for the next request.
current_proxy = 0

# Example of the form body the site's own page submits (captured from a browser):
#   text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1

def get_transcription(word, timeout=10):
	"""Fetch the IPA transcription of *word* from the lingorado.com form.

	The form is posted (with the site's ``'am'`` dialect setting) through the
	next entry of the module-level ``proxy_list``; the module-level
	``current_proxy`` index is advanced round-robin on every attempt.  When a
	proxy fails, times out, or returns a page that contains neither a
	transcription nor the site's "missing" marker, the next proxy is tried.
	There is no retry limit.

	Args:
		word: Text to transcribe.
		timeout: Seconds to wait on a single proxy before giving up on it and
			moving to the next one.

	Returns:
		One of:
		* ``'[...]'`` - the text of the first ``transcribed_word`` span in
		  square brackets, when that span carries no link id;
		* a ``', '``-separated string built from the ``<id>_notes = "..."``
		  assignment embedded in the page, when the span links to notes;
		* ``"Transcription missing!"`` when the page contains a
		  ``transcription_missing`` span instead;
		* ``''`` when interrupted with Ctrl-C, or when the notes assignment
		  cannot be found (an error line is printed in that case).
	"""
	global current_proxy
	# Imported here so the parser's exception type is available without
	# touching the module-level imports.
	from lxml import etree

	payload = {
		'text_to_transcribe': word,
		'output_dialect': 'am',
		'output_style': 'only_tr',
		'weak_forms': 'on',
		'speech_support': 1,
		'submit': 'Show+transcription',
	}

	while True:
		proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
		current_proxy = (current_proxy + 1) % len(proxy_list)
		try:
			# Without a timeout a single unresponsive proxy blocks forever.
			response = requests.post(base_url, data=payload, proxies=proxy_dict, timeout=timeout)
		except KeyboardInterrupt:
			return ""
		except Exception:
			# Best effort: whatever went wrong with this proxy, try the next.
			continue

		try:
			tree = html.fromstring(response.content)
		except etree.LxmlError:
			# E.g. an empty body returned by the proxy; not a usable answer.
			continue

		found = tree.xpath("//span[@class='transcribed_word']")
		if not found:
			if tree.xpath("//span[@class='transcription_missing']"):
				return "Transcription missing!"
			# Neither marker present: treat the response as unusable.
			continue
		transcribed_word = found[0]

		anchor_ids = transcribed_word.xpath(".//a/@id")
		if not anchor_ids or not anchor_ids[0]:
			return '[' + transcribed_word.xpath(".//text()")[0] + ']'

		# Drop the last "_suffix" of the anchor id to get the prefix used by
		# the "<prefix>_notes" assignment in the page.
		notes_id = anchor_ids[0]
		m = re.search(r'(.*)_[^_]+', notes_id)
		if m:
			notes_id = m.group(1)

		# Search the raw body where it is already a native str (Python 2);
		# otherwise use the decoded text so the str pattern can be applied.
		page = response.content if isinstance(response.content, str) else response.text
		# The id comes from the page, so escape it before using it in a regex.
		m = re.search(re.escape(notes_id) + '_notes = "([^"]+)"', page)
		if not m:
			print('\033[91m' + "Error! Couldn't make transcription." + '\033[0m')
			return ''

		trans = re.sub(r'\d+\. ', '', m.group(1))   # strip "1. ", "2. " numbering
		trans = re.sub('(<i>|</i>|:)', '', trans)   # strip italics tags and colons
		trans = re.sub('<br ?/>', ', ', trans)      # line breaks become separators
		return re.sub(' ,', ',', trans)

# Randomise the proxy order once per run, before the first request is made.
random.shuffle(proxy_list)

# Transcribe every line of words-list.txt and print one
# "<word> <TAB><transcription>" row per line to stdout.
with open('words-list.txt') as lines:
	for line in lines:
		word = line.strip()
		if not word:
			# get_transcription() retries until the response contains
			# transcription markup, which an empty query presumably never
			# produces; skip blank lines instead of sending them.
			continue
		transcription = get_transcription(word)
		if not transcription:
			# An empty result means the run was interrupted or the page could
			# not be parsed; stop processing the list.
			break
		print(line.replace('\n', ' ') + '\t' + transcription)