Skip to content

Instantly share code, notes, and snippets.

@rraallvv
Created August 12, 2017 16:42
Show Gist options
  • Save rraallvv/61db560e57617096c9aba36b92e37a53 to your computer and use it in GitHub Desktop.
Save rraallvv/61db560e57617096c9aba36b92e37a53 to your computer and use it in GitHub Desktop.
Cambridge Dictionary IPA phonetic transcription
#!/usr/bin/python
import requests
from lxml import html
import re
from random import randint
from time import sleep
import sys
import random
import fake_useragent
from fake_useragent import UserAgent
# Root of every lookup URL; the (lower-cased) word is appended to it.
base_url = 'http://dictionary.cambridge.org/dictionary/english/'

# HTTP proxies ("host:port") that requests are rotated through.
proxy_list = [
    '183.111.169.207:3128',
    '54.157.185.100:10000',
    '185.92.220.84:3128',
    '91.221.61.126:3128',
    '161.68.250.139:80',
    '183.111.169.203:3128',
    '200.35.187.114:8080',
    '208.85.182.130:8080',
    '52.33.65.51:80',
    '207.28.38.2:3128',
    '12.207.13.20:3128',
    '173.220.170.242:7004',
    '69.143.93.173:3128',
    '107.170.232.185:80',
    '119.52.160.2:3128',
    '124.192.17.123:3128',
    '107.170.232.185:8080',
    '1.234.45.50:3128',
    '118.26.143.202:3128',
    '92.46.122.98:3128',
    '221.199.203.70:3128',
    '107.151.152.218:80',
    '60.191.153.12:3128',
    '118.144.187.254:3128',
    '118.163.108.104:3128',
]

# Index into proxy_list of the proxy the next request will use.
current_proxy = 0

# Request headers; the 'user-agent' value is replaced before each request.
header_dict = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',
}
#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined
def get_transcription(word, timeout=10, max_attempts=None):
    """Fetch the American-English IPA transcription(s) of ``word``.

    The Cambridge Dictionary page for ``word`` is requested through the
    proxies in ``proxy_list`` (round-robin, advancing the module-level
    ``current_proxy``) with a random user agent per attempt.  A failed
    request, an empty body, or a page without the American-English tab
    moves on to the next proxy.

    Args:
        word: The word to look up.
        timeout: Seconds to wait on each request before giving up on the
            current proxy, so a dead proxy cannot stall the run.
        max_attempts: Upper bound on the number of attempts.  ``None``
            (the default) retries without limit.

    Returns:
        One ``"/ipa/ pos, pos\\taudio_url"`` item per distinct
        transcription, joined with ``" | "``.  An empty string is returned
        when the request is interrupted with Ctrl-C, when ``max_attempts``
        is exhausted, or when the page holds no pronunciations.
    """
    global current_proxy
    # Build the user-agent source once per call instead of once per retry.
    user_agents = UserAgent()
    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
        header_dict['user-agent'] = user_agents.random
        current_proxy = (current_proxy + 1) % len(proxy_list)
        url = base_url + word.lower().replace("'", '-') + "?q=" + word
        try:
            response = requests.get(url, proxies=proxy_dict,
                                    headers=header_dict, timeout=timeout)
        except KeyboardInterrupt:
            return ""
        except Exception:
            # Deliberately broad: whatever goes wrong with this proxy, the
            # right reaction is to report it and rotate to the next one.
            # (Unlike a bare ``except``, SystemExit is not swallowed.)
            print('\033[91m' + "Connection error with " + proxy_dict['http'] + " " + header_dict['user-agent'] + '\033[0m')
            continue
        if not response.content.strip():
            # lxml refuses to parse an empty document; treat as a bad proxy.
            continue
        tree = html.fromstring(response.content)
        american = tree.xpath("//div[@data-tab='ds-american-english']")
        if not american:
            continue
        entries = _extract_entries(american[0])
        return " | ".join(
            entry[0] + ' ' + ", ".join(entry[1]) + '\t' + entry[2]
            for entry in entries)
    return ""


def _merge_unique(first, second):
    """Return the items of ``first`` then ``second``, de-duplicated, in order."""
    merged = []
    for item in list(first) + list(second):
        if item not in merged:
            merged.append(item)
    return merged


def _extract_entries(american):
    """Collect ``[transcription, pos_tags, audio_url]`` per distinct transcription.

    ``american`` is the element of the American-English tab.  Entries that
    share a transcription are merged by combining their part-of-speech tags;
    the audio URL of the first such entry is kept.
    """
    entries = []
    for alternative in american.xpath(".//div[contains(@class, 'entry-body__el')]"):
        prons = alternative.xpath(".//span[@class='pron']")
        if not prons:
            continue
        ipa_parts = prons[0].xpath(".//span[contains(@class, 'ipa')]/text()")
        transcription = "/" + ", ".join(ipa_parts) + "/"
        # Not every entry carries an mp3; fall back to an empty URL instead
        # of raising IndexError.
        audio_urls = alternative.xpath(".//span[@data-src-mp3]/@data-src-mp3")
        audio_url = audio_urls[0] if audio_urls else ""
        posgrams = alternative.xpath(".//span[contains(@class, 'posgram')]")
        if posgrams:
            pos_tags = posgrams[0].xpath(".//span[@class='pos']/text()")
        else:
            pos_tags = []
        for entry in entries:
            if entry[0] == transcription:
                entry[1] = _merge_unique(entry[1], pos_tags)
                break
        else:
            entries.append([transcription, pos_tags, audio_url])
    return entries
# Randomise the proxy order so each run starts from a different proxy.
random.shuffle(proxy_list)

# Look up every word of words-list.txt (one per line) and print
# "<word>\t<transcriptions>" for each.
with open('words-list.txt') as lines:
    for line in lines:
        # strip() also drops '\r' and stray spaces that would otherwise end
        # up in the request URL.
        word = line.strip()
        if not word:
            # A blank line (e.g. a trailing newline at EOF) has nothing to
            # look up and would make get_transcription retry forever.
            continue
        transcription = get_transcription(word)
        if not transcription:
            # An empty result is how get_transcription reports Ctrl-C
            # (among other things); stop processing the list.
            break
        print(word + '\t' + transcription)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment