Created
August 12, 2017 16:42
-
-
Save rraallvv/61db560e57617096c9aba36b92e37a53 to your computer and use it in GitHub Desktop.
Cambridge Dictionary IPA phonetic transcription
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import requests | |
from lxml import html | |
import re | |
from random import randint | |
from time import sleep | |
import sys | |
import random | |
import fake_useragent | |
from fake_useragent import UserAgent | |
# Root of the Cambridge English dictionary; get_transcription() appends the
# looked-up word to this URL.
base_url = 'http://dictionary.cambridge.org/dictionary/english/'

# HTTP proxies in "host:port" form. They are shuffled once at start-up and
# then used in round-robin order, one per request attempt.
proxy_list = [
    '183.111.169.207:3128',
    '54.157.185.100:10000',
    '185.92.220.84:3128',
    '91.221.61.126:3128',
    '161.68.250.139:80',
    '183.111.169.203:3128',
    '200.35.187.114:8080',
    '208.85.182.130:8080',
    '52.33.65.51:80',
    '207.28.38.2:3128',
    '12.207.13.20:3128',
    '173.220.170.242:7004',
    '69.143.93.173:3128',
    '107.170.232.185:80',
    '119.52.160.2:3128',
    '124.192.17.123:3128',
    '107.170.232.185:8080',
    '1.234.45.50:3128',
    '118.26.143.202:3128',
    '92.46.122.98:3128',
    '221.199.203.70:3128',
    '107.151.152.218:80',
    '60.191.153.12:3128',
    '118.144.187.254:3128',
    '118.163.108.104:3128',
]

# Index into proxy_list of the proxy to use for the next request attempt.
current_proxy = 0

# Headers sent with every request. The 'user-agent' value below is only a
# starting value: get_transcription() overwrites it before each attempt.
header_dict = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0',
}

# NOTE(review): the line below appears to be a leftover sample form payload;
# nothing in this file reads it.
#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined
def _collect_entries(american):
    """Return one ``[transcription, pos_list, audio_url]`` record per distinct IPA string."""
    entries = []
    by_transcription = {}
    for alternative in american.xpath(".//div[contains(@class, 'entry-body__el')]"):
        prons = alternative.xpath(".//span[@class='pron']")
        if not prons:
            continue
        ipa_parts = prons[0].xpath(".//span[contains(@class, 'ipa')]/text()")
        transcription = "/" + ", ".join(ipa_parts) + "/"

        # Not every entry carries an audio clip; fall back to an empty URL
        # instead of raising IndexError.
        audio = alternative.xpath(".//span[@data-src-mp3]/@data-src-mp3")
        audio_url = audio[0] if audio else ""

        posgram = alternative.xpath(".//span[contains(@class, 'posgram')]")
        pos_tags = posgram[0].xpath(".//span[@class='pos']/text()") if posgram else []

        entry = by_transcription.get(transcription)
        if entry is None:
            # First time this pronunciation is seen: its audio URL is kept.
            entry = [transcription, [], audio_url]
            by_transcription[transcription] = entry
            entries.append(entry)
        # Merge parts of speech in first-seen order so the output is
        # deterministic (a set union would reorder them arbitrarily).
        for tag in pos_tags:
            if tag not in entry[1]:
                entry[1].append(tag)
    return entries


def _format_entries(entries):
    """Render records from ``_collect_entries`` as one ``" | "``-separated string."""
    return " | ".join(
        entry[0] + ' ' + ", ".join(entry[1]) + '\t' + entry[2]
        for entry in entries
    )


def get_transcription(word, max_attempts=None, timeout=10):
    """Look up *word* in the Cambridge dictionary and return its American IPA.

    Each attempt goes through the next proxy in ``proxy_list`` (the
    module-level ``current_proxy`` index is advanced round-robin) with a
    random user agent written into ``header_dict``. The attempt is retried
    with the following proxy when the request fails, when the response
    cannot be parsed as HTML, or when the page has no American-English tab.

    Args:
        word: The word to look up. Apostrophes are replaced by hyphens and
            the word is lower-cased for the URL path; the original spelling
            is sent as the ``q`` query parameter.
        max_attempts: Maximum number of proxies to try, or ``None`` (the
            default) to keep retrying indefinitely.
        timeout: Seconds to wait on a proxy before giving up on it, so that
            an unresponsive proxy cannot block the lookup forever.

    Returns:
        A string of ``"/ipa/ pos, pos<TAB>audio_url"`` items joined by
        ``" | "``, one per distinct transcription. An empty string is
        returned when the lookup is interrupted with Ctrl-C, when
        ``max_attempts`` is exhausted, or when the American tab contains no
        pronunciations.
    """
    global current_proxy

    # Local import: only the parser's error type is needed from lxml.etree.
    from lxml import etree

    # Build the user-agent source once per lookup rather than once per retry.
    user_agents = UserAgent()
    url = base_url + word.lower().replace("'", '-')

    attempts = 0
    while max_attempts is None or attempts < max_attempts:
        attempts += 1
        proxy_dict = {'http': 'http://' + proxy_list[current_proxy]}
        header_dict['user-agent'] = user_agents.random
        current_proxy = (current_proxy + 1) % len(proxy_list)

        try:
            # Passing the word via ``params`` lets requests URL-encode it.
            response = requests.get(
                url,
                params={'q': word},
                proxies=proxy_dict,
                headers=header_dict,
                timeout=timeout,
            )
        except KeyboardInterrupt:
            # An empty result tells the caller to stop.
            return ""
        except requests.exceptions.RequestException:
            print('\033[91m' + "Connection error with " + proxy_dict['http'] + " " + header_dict['user-agent'] + '\033[0m')
            continue

        try:
            tree = html.fromstring(response.content)
        except etree.ParserError:
            # e.g. the proxy answered with an empty body; try the next one.
            continue

        american = tree.xpath("//div[@data-tab='ds-american-english']")
        if not american:
            # Not the expected dictionary page; try the next proxy.
            continue

        return _format_entries(_collect_entries(american[0]))

    return ""
# Randomise the proxy order once so successive runs do not always hit the
# same proxies first.
random.shuffle(proxy_list)

# Read one word per line from words-list.txt and print
# "<word><TAB><transcription>" for each of them.
with open('words-list.txt') as lines:
    for line in lines:
        # strip() also removes the '\r' left over from CRLF line endings.
        word = line.strip()
        if not word:
            # A blank line would otherwise be sent as an empty lookup.
            continue
        transcription = get_transcription(word)
        if not transcription:
            # An empty result (e.g. after Ctrl-C) ends the run.
            break
        print(word + '\t' + transcription)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment