-
-
Save kelciour/8b0b8ecc02ee91a83ab2665c0c7dcfc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 | |
import codecs | |
import re | |
import requests | |
import shutil | |
import sys | |
from datetime import datetime, timedelta | |
from lxml import html | |
from lxml import etree | |
from lxml.cssselect import CSSSelector | |
# Request headers passed to every requests.get() call in this script.
# The User-Agent is a desktop Chrome string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36",
}
def get_definition(url):
    """Download a dictionary page and extract its first entry.

    The page HTML is matched against four known layouts, in order; the
    first layout that matches decides the result.

    Args:
        url: Address of the page to fetch (sent with the module-level
            ``headers``).

    Returns:
        A ``(word, definition)`` tuple of stripped strings. ``word`` is the
        text captured from the entry's "posgram"/"pos" span, the literal
        ``"idiom"`` for an idiom layout, or ``""`` when nothing matched.
        ``definition`` is the captured HTML fragment with query links
        unwrapped and whitespace collapsed, or ``""`` when nothing matched
        or the fragment contains a ``<div class="smartt">`` block.
    """
    page = requests.get(url, headers=headers).text

    def wrap(fragment):
        # Some layouts capture only the inside of the body, so the
        # enclosing pos-body div is put back around it.
        return '<div class="pos-body">' + fragment + '</div>'

    # (pattern, extractor) pairs, tried top to bottom. Each extractor turns
    # a match object into the (word, definition) pair for that layout.
    layouts = (
        (
            r'<span class="posgram ico-bg"><span [^>]+>([^<]+)</span>[\s\S]*?(<div class="pos-body">[\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">',
            lambda m: (m.group(1), m.group(2)),
        ),
        (
            r'<span class="posgram"><span[^>]+>([^<]+)</span>[\s\S]*?<div class="di-body[^>]+>([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">',
            lambda m: (m.group(1), m.group(2)),
        ),
        (
            r'<div class="di-body[^>]+>[\s\S]*?<span class="pos"[^>]+>(.*?)</span>[\s\S]*?<div class="pos-body">([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">',
            lambda m: (m.group(1), wrap(m.group(2))),
        ),
        (
            r'<div class="di-body[^>]+>[\s\S]*?(<div class="idiom-block">[\s\S]*?</div>)\s*(?:</div>|</span>)\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">',
            lambda m: ("idiom", wrap(m.group(1))),
        ),
    )

    # Defaults used when no layout matches; the single space is reduced to
    # an empty string by the strip() below.
    word, definition = "", " "
    for pattern, extract in layouts:
        found = re.search(pattern, page)
        if found:
            word, definition = extract(found)
            break

    # Replace cross-reference links with their plain link text.
    definition = re.sub(r'<a class="query" href=[^>]+>([^<]+)</a>', '\\1', definition)
    # Collapse every whitespace run so the result fits on one line.
    definition = re.sub(r'\s+', ' ', definition)
    definition = definition.strip()
    word = word.strip()

    # Fragments containing a "smartt" div are discarded entirely.
    # NOTE(review): presumably a thesaurus box rather than a definition -
    # confirm against a live page.
    if '<div class="smartt">' in definition:
        definition = ""
    return (word, definition)
# Main script: for every line of the tab-separated input file, look the word
# up and write exactly one line to log.txt. Field 0 of the input line is the
# word, field 2 is its expected part of speech.
with codecs.open('log.txt', 'w', 'utf-8') as log_file, codecs.open('Word of the Day.txt', 'r', 'utf-8') as anki_file:
    for line in anki_file:
        fields = line.split('\t')
        # Turn the HTML apostrophe entity back into a plain apostrophe.
        # NOTE(review): the entity literal was lost from this line (it read
        # replace(''', "'"), which does not parse); '&#39;' is the assumed
        # original spelling - TODO confirm against the input file.
        base_word = fields[0].replace('&#39;', "'")
        word_part = fields[2].strip()
        # Progress output. Parenthesised so the line parses on Python 2 and 3;
        # on Python 2 the output is the same as the bare print statement.
        print(base_word.encode('ascii', 'ignore'))
        url = "http://dictionary.cambridge.org/search/english-russian/direct/?q=" + base_word.replace(' ', '+')
        word, definition = get_definition(url)

        # Accept the first entry when its part of speech matches, or when the
        # expected part of speech is not one of the five checked here.
        if word == word_part or word_part not in ["adjective", "noun", "verb", "adverb", "pronoun"]:
            log_file.write(definition + "\n")
            continue

        # Nothing was extracted: still emit a line for this word.
        if not definition:
            log_file.write("\n")
            continue

        # The first entry has a different part of speech. Fetch the page again
        # and look for a list link whose headword and part of speech both
        # match. The values are escaped so that characters such as "." or "?"
        # in a word are matched literally instead of being read as regex syntax.
        r = requests.get(url, headers=headers)
        url_regex = r'<div class="oflow-hide">[\s\S]*<li><a href=("([^"]+)"[\s\S]*?)<b class="hw"[^>]+>%s</b></span>\s*?<span class="pos">%s</span>' % (re.escape(base_word), re.escape(word_part))
        m = re.search(url_regex, r.text)
        if not m:
            log_file.write("<p>Different Part of Speech:</p>" + definition + "\n")
            continue

        s = m.group(1)
        if "<li>" in s:
            # The lazy match ran across another list item, so the captured
            # href is not the one next to the matched headword; write a
            # marker line instead of following it.
            log_file.write("LI" + "\n")
            continue

        url = m.group(2)
        word, definition = get_definition(url)
        log_file.write(definition + "\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment