Skip to content

Instantly share code, notes, and snippets.

Created August 12, 2017 16:42
Show Gist options
  • Save rraallvv/61db560e57617096c9aba36b92e37a53 to your computer and use it in GitHub Desktop.
Save rraallvv/61db560e57617096c9aba36b92e37a53 to your computer and use it in GitHub Desktop.
Cambridge Dictionary IPA phonetic transcription
import requests
from lxml import html
import re
from random import randint
from time import sleep
import sys
import random
import fake_useragent
from fake_useragent import UserAgent
# Proxy hosts that get_transcription cycles through, one per request attempt.
# Every entry is a blank placeholder here and is prefixed with 'http://' at
# request time.
proxy_list = [
    '', '', '', '', '',
    '', '', '', '', '',
    '', '', '', '', '',
    '', '', '', '', '',
    '', '', '', '', '',
]

# Index into proxy_list of the proxy to use for the next request attempt.
current_proxy = 0

# Headers sent with every request; get_transcription overwrites the
# 'user-agent' value before each attempt.
header_dict = {}
header_dict['user-agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'

# NOTE(review): get_transcription reads a module-level `base_url` that is not
# defined anywhere in the visible source -- it must be supplied before use.
# Leftover form payload kept for reference; nothing below reads it:
#text_to_transcribe=impress&submit=Show+transcription&output_dialect=am&output_style=only_tr&weak_forms=on&preBracket=&postBracket=&speech_support=1: undefined
def _fetch_word_page(word, timeout):
    """Download the dictionary page for *word*, rotating proxy and user agent.

    Each attempt takes the next entry of ``proxy_list`` (advancing the
    module-level ``current_proxy``) and a fresh random user agent. Failed
    attempts are reported on stdout and retried indefinitely with the next
    proxy.

    Returns the ``requests`` response, or ``None`` if the user interrupts
    the download with Ctrl-C.
    """
    global current_proxy
    # Built outside the try block so that a missing/invalid base_url surfaces
    # as an error instead of being retried forever as a "connection error".
    url = base_url + word.lower().replace("'", '-')
    while True:
        proxy = ''
        if proxy_list:  # an empty list means "no proxies", not a modulo-by-zero
            proxy = proxy_list[current_proxy % len(proxy_list)]
            current_proxy = (current_proxy + 1) % len(proxy_list)
        proxy_url = 'http://' + proxy
        # A blank placeholder entry would give the invalid proxy 'http://';
        # connect directly in that case.
        # NOTE(review): only the 'http' scheme is proxied, as in the original.
        proxies = {'http': proxy_url} if proxy else None
        try:
            header_dict['user-agent'] = UserAgent().random
            # `params` URL-encodes the query instead of pasting the raw word.
            return requests.get(url, params={'q': word}, proxies=proxies,
                                headers=header_dict, timeout=timeout)
        except KeyboardInterrupt:
            return None
        except Exception:
            print('\033[91m' + "Connection error with " + proxy_url + " " +
                  header_dict['user-agent'] + '\033[0m')


def _collect_pronunciations(american):
    """Gather pronunciations from the American-English section element.

    Returns a list of ``[transcription, parts_of_speech, audio_url]`` lists,
    one per distinct transcription, in page order. Parts of speech of entries
    sharing a transcription are merged without duplicates, keeping
    first-seen order. ``audio_url`` is the first mp3 found for the first
    entry with that transcription, or ``''`` when it has none.
    """
    collected = []
    by_transcription = {}
    for alternative in american.xpath(".//div[contains(@class, 'entry-body__el')]"):
        pron = alternative.xpath(".//span[@class='pron']")
        if not pron:
            # No pronunciation in this entry; nothing to report for it.
            continue
        ipa_parts = pron[0].xpath(".//span[contains(@class, 'ipa')]/text()")
        transcription = "/" + ", ".join(ipa_parts) + "/"

        audio = alternative.xpath(".//span[@data-src-mp3]/@data-src-mp3")
        audio_url = audio[0] if audio else ''

        posgram = alternative.xpath(".//span[contains(@class, 'posgram')]")
        if posgram:
            parts_of_speech = posgram[0].xpath(".//span[@class='pos']/text()")
        else:
            parts_of_speech = []

        entry = by_transcription.get(transcription)
        if entry is None:
            entry = [transcription, [], audio_url]
            by_transcription[transcription] = entry
            collected.append(entry)
        for part in parts_of_speech:
            if part not in entry[1]:
                entry[1].append(part)
    return collected


def get_transcription(word, timeout=10):
    """Return the American-English IPA transcription(s) of *word*.

    The page for *word* is fetched from ``base_url`` and its
    ``ds-american-english`` section is scanned. Each distinct transcription
    is rendered as ``"/ipa/ pos1, pos2<TAB>audio_url"`` and the renderings
    are joined with ``" | "``.

    NOTE(review): the published listing had lost its indentation and a few
    one-word lines (try/except/else/continue/break); the control flow here is
    a reconstruction -- confirm against the author's intent.

    Args:
        word: the word to look up.
        timeout: seconds to wait for each HTTP attempt before retrying.

    Returns:
        The formatted transcription string, or ``""`` when *word* is blank,
        the download was interrupted, or the page has no American-English
        pronunciation.
    """
    if not word or not word.strip():
        return ""

    response = _fetch_word_page(word, timeout)
    if response is None:
        return ""
    if not response.content.strip():
        # lxml refuses to parse an empty document.
        return ""

    tree = html.fromstring(response.content)
    american = tree.xpath("//div[@data-tab='ds-american-english']")
    if isinstance(american, list):
        if not american:
            return ""
        american = american[0]

    return " | ".join(
        transcription + ' ' + ", ".join(parts_of_speech) + '\t' + audio_url
        for transcription, parts_of_speech, audio_url
        in _collect_pronunciations(american))
def main():
    """Print one "word<TAB>transcription" line per word in words-list.txt.

    The file is read one word per line. Blank lines are skipped, and so are
    words for which get_transcription returns an empty string.
    """
    with open('words-list.txt') as lines:
        for line in lines:
            # Strip the line terminator, including the '\r' of Windows files.
            word = line.rstrip('\r\n')
            if not word:
                continue
            transcription = get_transcription(word)
            if transcription:
                # Single-argument call form works as both a Python 2 print
                # statement and a Python 3 print function.
                print(word + '\t' + transcription)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment