kelciour/cambridge.py Secret

## cambridge.py
# -*- coding: utf-8 -*-

import codecs
import re
import requests
import shutil
import sys

from datetime import datetime, timedelta
from lxml import html
from lxml import etree
from lxml.cssselect import CSSSelector

# Request headers passed to every requests.get() call in this script;
# the User-Agent value is a desktop Chrome identification string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 6.1) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.71 Safari/537.36"
    ),
}

def get_definition(url, timeout=30):
	"""Download a dictionary page and extract its part of speech and definition HTML.

	Several page layouts are tried in order and the first regex that matches
	wins.  Two of them (the third regular layout and the idiom layout)
	capture only the inside of the definition body, so their capture is
	wrapped in a ``<div class="pos-body">`` element again.

	Args:
		url: Address of the page to fetch.
		timeout: Seconds to wait for the server.  Without a timeout
			``requests`` waits indefinitely, so one stalled connection
			would block the whole run.

	Returns:
		A ``(word, definition)`` tuple of stripped strings.  ``word`` is the
		label captured from the page, or ``"idiom"`` when only the idiom
		layout matched.  ``definition`` is the captured HTML with
		``<a class="query">`` links replaced by their text and whitespace
		runs collapsed to single spaces.  Both are empty when no layout
		matches; ``definition`` is also emptied when it contains a
		``<div class="smartt">`` block.

	Raises:
		requests.exceptions.Timeout: If the server does not answer within
			``timeout`` seconds.
	"""
	page = requests.get(url, headers=headers, timeout=timeout).text

	# (pattern, rewrap) pairs, tried in order.  Group 1 is the label and
	# group 2 the definition; ``rewrap`` marks patterns whose group 2 lacks
	# the opening pos-body div.
	layouts = (
		(r'<span class="posgram ico-bg"><span [^>]+>([^<]+)</span>[\s\S]*?(<div class="pos-body">[\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
		(r'<span class="posgram"><span[^>]+>([^<]+)</span>[\s\S]*?<div class="di-body[^>]+>([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
		(r'<div class="di-body[^>]+>[\s\S]*?<span class="pos"[^>]+>(.*?)</span>[\s\S]*?<div class="pos-body">([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', True),
	)
	idiom_regex = r'<div class="di-body[^>]+>[\s\S]*?(<div class="idiom-block">[\s\S]*?</div>)\s*(?:</div>|</span>)\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">'

	word = ""
	definition = ""
	for pattern, rewrap in layouts:
		m = re.search(pattern, page)
		if not m:
			continue
		word = m.group(1)
		definition = m.group(2)
		if rewrap:
			definition = '<div class="pos-body">' + definition + '</div>'
		break
	else:
		# No regular layout matched: last resort is the idiom layout, which
		# carries no label of its own.
		m = re.search(idiom_regex, page)
		if m:
			word = "idiom"
			definition = '<div class="pos-body">' + m.group(1) + '</div>'

	# Keep only the text of query links, then normalise whitespace.
	definition = re.sub(r'<a class="query" href=[^>]+>([^<]+)</a>', '\\1', definition)
	definition = re.sub(r'\s+', ' ', definition)
	definition = definition.strip()
	word = word.strip()

	# Captures containing a "smartt" block are discarded entirely.
	if '<div class="smartt">' in definition:
		definition = ""

	return (word, definition)


# Batch driver: for every tab-separated row of 'Word of the Day.txt' (field 0 is
# the word, field 2 its part of speech) look the word up and write exactly one
# line to 'log.txt'.
with codecs.open('log.txt', 'w', 'utf-8') as log_file, codecs.open('Word of the Day.txt', 'r', 'utf-8') as anki_file:
	known_parts = ("adjective", "noun", "verb", "adverb", "pronoun")

	for line in anki_file:
		fields = line.split('\t')
		if len(fields) < 3:
			# Blank or short row: indexing field 2 would raise IndexError.
			# NOTE(review): an empty line is written on the assumption that
			# log.txt must stay aligned row-for-row with the input - confirm.
			log_file.write("\n")
			continue

		base_word = fields[0].replace('&#039;', "'")
		word_part = fields[2].strip()
		# Progress output, ASCII only.  The decode keeps the printed text the
		# same whether print is the Python 2 statement or the Python 3 function.
		print(base_word.encode('ascii', 'ignore').decode('ascii'))

		url = "http://dictionary.cambridge.org/search/english-russian/direct/?q=" + base_word.replace(' ', '+')
		word, definition = get_definition(url)

		# The found entry already has the wanted part of speech, or the
		# wanted one is not among those this script knows how to look for.
		if word == word_part or word_part not in known_parts:
			log_file.write(definition + "\n")
			continue

		if not definition:
			log_file.write("\n")
			continue

		# Part of speech differs: fetch the page again and look in its entry
		# list for a link to the same headword with the wanted part of speech.
		r = requests.get(url, headers=headers, timeout=30)
		# re.escape keeps regex metacharacters in the word (".", "(", "?", ...)
		# from being interpreted as pattern syntax.
		url_regex = r'<div class="oflow-hide">[\s\S]*<li><a href=("([^"]+)"[\s\S]*?)<b class="hw"[^>]+>%s</b></span>\s*?<span class="pos">%s</span>' % (re.escape(base_word), re.escape(word_part))
		m = re.search(url_regex, r.text)
		if not m:
			log_file.write("<p>Different Part of Speech:</p>" + definition + "\n")
			continue

		if "<li>" in m.group(1):
			# The match ran across a list-item boundary, so the captured href
			# belongs to a different entry; write a marker instead.
			log_file.write("LI" + "\n")
			continue

		url = m.group(2)
		word, definition = get_definition(url)
		log_file.write(definition + "\n")
	# -*- coding: utf-8 -*-

	import codecs
	import re
	import requests
	import shutil
	import sys

	from datetime import datetime, timedelta
	from lxml import html
	from lxml import etree
	from lxml.cssselect import CSSSelector

	# Request headers passed to every requests.get() call in this script;
	# the User-Agent value is a desktop Chrome identification string.
	headers = {
		"User-Agent": (
			"Mozilla/5.0 (Windows NT 6.1) "
			"AppleWebKit/537.36 (KHTML, like Gecko) "
			"Chrome/54.0.2840.71 Safari/537.36"
		),
	}

	def get_definition(url, timeout=30):
		"""Download a dictionary page and extract its part of speech and definition HTML.

		Several page layouts are tried in order and the first regex that
		matches wins.  Two of them (the third regular layout and the idiom
		layout) capture only the inside of the definition body, so their
		capture is wrapped in a ``<div class="pos-body">`` element again.

		Args:
			url: Address of the page to fetch.
			timeout: Seconds to wait for the server.  Without a timeout
				``requests`` waits indefinitely, so one stalled connection
				would block the whole run.

		Returns:
			A ``(word, definition)`` tuple of stripped strings.  ``word`` is
			the label captured from the page, or ``"idiom"`` when only the
			idiom layout matched.  ``definition`` is the captured HTML with
			``<a class="query">`` links replaced by their text and whitespace
			runs collapsed to single spaces.  Both are empty when no layout
			matches; ``definition`` is also emptied when it contains a
			``<div class="smartt">`` block.

		Raises:
			requests.exceptions.Timeout: If the server does not answer within
				``timeout`` seconds.
		"""
		page = requests.get(url, headers=headers, timeout=timeout).text

		# (pattern, rewrap) pairs, tried in order.  Group 1 is the label and
		# group 2 the definition; ``rewrap`` marks patterns whose group 2
		# lacks the opening pos-body div.  The ``*`` quantifiers matter: the
		# lazy ``[\s\S]*?`` spans arbitrary markup and ``\s*`` tolerates any
		# whitespace between the closing tags.
		layouts = (
			(r'<span class="posgram ico-bg"><span [^>]+>([^<]+)</span>[\s\S]*?(<div class="pos-body">[\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
			(r'<span class="posgram"><span[^>]+>([^<]+)</span>[\s\S]*?<div class="di-body[^>]+>([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
			(r'<div class="di-body[^>]+>[\s\S]*?<span class="pos"[^>]+>(.*?)</span>[\s\S]*?<div class="pos-body">([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', True),
		)
		idiom_regex = r'<div class="di-body[^>]+>[\s\S]*?(<div class="idiom-block">[\s\S]*?</div>)\s*(?:</div>|</span>)\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">'

		word = ""
		definition = ""
		for pattern, rewrap in layouts:
			m = re.search(pattern, page)
			if not m:
				continue
			word = m.group(1)
			definition = m.group(2)
			if rewrap:
				definition = '<div class="pos-body">' + definition + '</div>'
			break
		else:
			# No regular layout matched: last resort is the idiom layout,
			# which carries no label of its own.
			m = re.search(idiom_regex, page)
			if m:
				word = "idiom"
				definition = '<div class="pos-body">' + m.group(1) + '</div>'

		# Keep only the text of query links, then normalise whitespace.
		definition = re.sub(r'<a class="query" href=[^>]+>([^<]+)</a>', '\\1', definition)
		definition = re.sub(r'\s+', ' ', definition)
		definition = definition.strip()
		word = word.strip()

		# Captures containing a "smartt" block are discarded entirely.
		if '<div class="smartt">' in definition:
			definition = ""

		return (word, definition)


	# Batch driver: for every tab-separated row of 'Word of the Day.txt' (field 0
	# is the word, field 2 its part of speech) look the word up and write exactly
	# one line to 'log.txt'.
	with codecs.open('log.txt', 'w', 'utf-8') as log_file, codecs.open('Word of the Day.txt', 'r', 'utf-8') as anki_file:
		known_parts = ("adjective", "noun", "verb", "adverb", "pronoun")

		for line in anki_file:
			fields = line.split('\t')
			if len(fields) < 3:
				# Blank or short row: indexing field 2 would raise IndexError.
				# NOTE(review): an empty line is written on the assumption that
				# log.txt must stay aligned row-for-row with the input - confirm.
				log_file.write("\n")
				continue

			# The input stores apostrophes as the HTML entity &#039;.
			base_word = fields[0].replace('&#039;', "'")
			word_part = fields[2].strip()
			# Progress output, ASCII only.  The decode keeps the printed text
			# the same whether print is the Python 2 statement or the
			# Python 3 function.
			print(base_word.encode('ascii', 'ignore').decode('ascii'))

			url = "http://dictionary.cambridge.org/search/english-russian/direct/?q=" + base_word.replace(' ', '+')
			word, definition = get_definition(url)

			# The found entry already has the wanted part of speech, or the
			# wanted one is not among those this script knows how to look for.
			if word == word_part or word_part not in known_parts:
				log_file.write(definition + "\n")
				continue

			if not definition:
				log_file.write("\n")
				continue

			# Part of speech differs: fetch the page again and look in its
			# entry list for a link to the same headword with the wanted
			# part of speech.
			r = requests.get(url, headers=headers, timeout=30)
			# re.escape keeps regex metacharacters in the word (".", "(",
			# "?", ...) from being interpreted as pattern syntax.
			url_regex = r'<div class="oflow-hide">[\s\S]*<li><a href=("([^"]+)"[\s\S]*?)<b class="hw"[^>]+>%s</b></span>\s*?<span class="pos">%s</span>' % (re.escape(base_word), re.escape(word_part))
			m = re.search(url_regex, r.text)
			if not m:
				log_file.write("<p>Different Part of Speech:</p>" + definition + "\n")
				continue

			if "<li>" in m.group(1):
				# The match ran across a list-item boundary, so the captured
				# href belongs to a different entry; write a marker instead.
				log_file.write("LI" + "\n")
				continue

			url = m.group(2)
			word, definition = get_definition(url)
			log_file.write(definition + "\n")