Skip to content

Instantly share code, notes, and snippets.

@rraallvv
Last active December 27, 2020 23:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rraallvv/909e0e63b3fedc053a63332ba8f28de0 to your computer and use it in GitHub Desktop.
Save rraallvv/909e0e63b3fedc053a63332ba8f28de0 to your computer and use it in GitHub Desktop.
Lingorado IPA phonetic transcription
#!/usr/bin/python
import requests
from lxml import html
import re
from random import randint
from time import sleep
import sys
import random
# Endpoint that the transcription form is POSTed to.
base_url = 'http://lingorado.com/ipa/'

# HTTP proxies, as "host:port", that requests are routed through in turn.
proxy_list = [
    '89.187.217.114:80',
    '92.126.152.221:8080',
    '35.188.62.145:80',
    '171.255.199.3:3128',
    '178.62.91.24:8118',
    '203.74.4.4:80',
    '61.69.92.106:80',
    '94.177.180.226:80',
    '98.182.126.155:80',
    '203.74.4.0:80',
    '51.254.16.106:8080',
    '219.76.4.72:88',
    '203.74.4.6:80',
    '125.16.128.118:3128',
    '217.15.85.202:8080',
    '24.249.80.22:3128',
    '34.252.130.88:8080',
    '62.214.70.116:3128',
    '89.187.217.116:80',
    '219.76.4.12:88',
]

# Index into proxy_list of the proxy the next request will use.
current_proxy = 0
# Sample form body for the word "impress" (apparently captured from a browser request): text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1
def get_transcription(word, timeout=30):
    """Fetch the IPA transcription of ``word`` from the Lingorado form.

    The form is POSTed to ``base_url`` (dialect ``am``, style ``only_tr``)
    through the proxies in ``proxy_list``, taken round-robin via the
    module-level ``current_proxy`` index.  A proxy whose request fails, or
    whose response contains neither a ``transcribed_word`` nor a
    ``transcription_missing`` span, is skipped and the next one is tried.
    There is no upper bound on the number of attempts.

    Args:
        word: Text to transcribe.
        timeout: Passed to ``requests.post`` as its ``timeout`` (seconds) so
            that an unresponsive proxy cannot stall the run indefinitely.

    Returns:
        One of:

        * ``"[<text>]"`` - the text of the ``transcribed_word`` span in
          square brackets, when the span holds no ``<a id=...>`` element
          (or the id is empty);
        * the page's ``<prefix>_notes`` string with list numbering, ``<i>``
          tags and colons removed and ``<br/>`` turned into ``", "``
          (presumably the alternative pronunciations - TODO confirm);
        * ``"Transcription missing!"`` when the page has a
          ``transcription_missing`` span;
        * ``""`` when interrupted with Ctrl-C, or when no ``_notes`` string
          could be found (an error line is printed in that case).
    """
    global current_proxy

    payload = {
        'text_to_transcribe': word,
        'output_dialect': 'am',
        'output_style': 'only_tr',
        'weak_forms': 'on',
        'speech_support': 1,
        # NOTE(review): in a URL-encoded form body '+' stands for a space;
        # requests will encode this literal '+' as %2B.  Left unchanged -
        # presumably the server ignores the value; TODO confirm.
        'submit': 'Show+transcription',
    }

    while True:
        # Pick the current proxy and advance the shared index for next time.
        proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
        current_proxy = (current_proxy + 1) % len(proxy_list)

        try:
            response = requests.post(base_url, data=payload,
                                     proxies=proxy_dict, timeout=timeout)
        except KeyboardInterrupt:
            # An empty result tells the caller to stop.
            return ""
        except Exception:
            # Best effort: any failure with this proxy just moves on to the
            # next one (SystemExit is no longer swallowed).
            continue

        tree = html.fromstring(response.content)

        spans = tree.xpath("//span[@class='transcribed_word']")
        if not spans:
            if tree.xpath("//span[@class='transcription_missing']"):
                return "Transcription missing!"
            # Neither marker is present: treat the response as unusable
            # and retry through the next proxy.
            continue
        transcribed_word = spans[0]

        anchor_ids = transcribed_word.xpath(".//a/@id")
        if not anchor_ids or not anchor_ids[0]:
            return '[' + transcribed_word.xpath(".//text()")[0] + ']'
        anchor_id = anchor_ids[0]

        # Drop the last "_suffix" of the id to get the name prefix of the
        # "<prefix>_notes" string embedded in the page.
        stripped = re.search(r'(.*)_[^_]+', anchor_id)
        prefix = stripped.group(1) if stripped else anchor_id

        # The prefix is escaped so that any regex metacharacters in the id
        # are matched literally.
        # NOTE(review): response.content is the raw (byte-string) body; this
        # file is Python 2, where that mixes freely with str patterns.
        notes = re.search(re.escape(prefix) + r'_notes = "([^"]+)"',
                          response.content)
        if not notes:
            print('\033[91m' + "Error! Couldn't make transcription." + '\033[0m')
            return ''

        trans = re.sub(r'\d+\. ', '', notes.group(1))
        trans = re.sub(r'(<i>|</i>|:)', '', trans)
        trans = re.sub(r'<br ?/>', ', ', trans)
        trans = re.sub(r' ,', ',', trans)
        return trans
# Randomise the proxy order before the round-robin in get_transcription
# starts walking the list.
random.shuffle(proxy_list)

# Transcribe every line of words-list.txt and print one
# "<word> <TAB><transcription>" line per word.
with open('words-list.txt') as lines:
    for word in lines:
        stripped_word = word.strip()
        if not stripped_word:
            # A blank line has nothing to transcribe; submitting it would
            # presumably never yield a result span and so make
            # get_transcription retry forever.
            continue
        # Send the word without its trailing newline.
        transcription = get_transcription(stripped_word)
        if not transcription:
            # "" means the lookup was interrupted or failed: stop the run.
            break
        # The output format is unchanged: the newline becomes a space.
        print(word.replace('\n', ' ') + '\t' + transcription)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment