Skip to content

Instantly share code, notes, and snippets.

@kelciour
Created September 30, 2017 15:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kelciour/8b0b8ecc02ee91a83ab2665c0c7dcfc0 to your computer and use it in GitHub Desktop.
Save kelciour/8b0b8ecc02ee91a83ab2665c0c7dcfc0 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import codecs
import re
import requests
import shutil
import sys
from datetime import datetime, timedelta
from lxml import html
from lxml import etree
from lxml.cssselect import CSSSelector
# Browser-like User-Agent string sent with every HTTP request this script makes.
_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36"

# Request headers shared by all `requests.get` calls in this file.
headers = {"User-Agent": _USER_AGENT}
def get_definition(url):
    """Download a dictionary page and pull out its first definition block.

    The page text is matched against several known HTML layouts, tried in a
    fixed order; the first layout that matches supplies the result.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(word, definition)`` tuple.  ``word`` is the text captured from
        the part-of-speech span of the matched layout, the literal ``"idiom"``
        for the idiom layout, or ``""`` when no layout matched.
        ``definition`` is the captured HTML with ``<a class="query">`` links
        replaced by their text and all whitespace runs collapsed to single
        spaces; it is ``""`` when no layout matched or when the captured HTML
        contains ``<div class="smartt">``.
    """
    page = requests.get(url, headers=headers).text

    # (pattern, wrap) pairs, tried top to bottom.  Group 1 is the
    # part-of-speech text and group 2 the definition HTML; ``wrap`` marks
    # layouts whose group 2 starts *inside* the pos-body div, so the div
    # has to be put back around it.
    layouts = (
        (r'<span class="posgram ico-bg"><span [^>]+>([^<]+)</span>[\s\S]*?(<div class="pos-body">[\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
        (r'<span class="posgram"><span[^>]+>([^<]+)</span>[\s\S]*?<div class="di-body[^>]+>([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', False),
        (r'<div class="di-body[^>]+>[\s\S]*?<span class="pos"[^>]+>(.*?)</span>[\s\S]*?<div class="pos-body">([\s\S]*?)</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">', True),
    )
    # Last resort: an idiom entry, which has no part-of-speech span at all.
    idiom_layout = r'<div class="di-body[^>]+>[\s\S]*?(<div class="idiom-block">[\s\S]*?</div>)\s*(?:</div>|</span>)\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*</div>\s*<div class="definition-src">'

    # Defaults used when nothing matches; the single space is stripped to ""
    # further down.
    word = ""
    definition = " "

    for pattern, wrap in layouts:
        found = re.search(pattern, page)
        if found:
            word = found.group(1)
            definition = found.group(2)
            if wrap:
                definition = '<div class="pos-body">' + definition + '</div>'
            break
    else:
        found = re.search(idiom_layout, page)
        if found:
            word = "idiom"
            definition = '<div class="pos-body">' + found.group(1) + '</div>'

    # Replace cross-reference links with their plain text.
    definition = re.sub(r'<a class="query" href=[^>]+>([^<]+)</a>', '\\1', definition)
    # Collapse all whitespace so the result fits on one output line.
    definition = re.sub(r'\s+', ' ', definition).strip()
    word = word.strip()

    # Results containing a "smartt" div are discarded entirely.
    # NOTE(review): presumably this marks a non-definition widget -- confirm.
    if '<div class="smartt">' in definition:
        definition = ""
    return (word, definition)
# Read the tab-separated file 'Word of the Day.txt' and write exactly one line
# to 'log.txt' for every input line.  Column 0 of each input line is the
# headword and column 2 is the part of speech wanted for it.  The line written
# is one of:
#   * the definition HTML returned by get_definition(),
#   * an empty line when no definition was found,
#   * the definition prefixed with "<p>Different Part of Speech:</p>" when the
#     page has no entry for the wanted part of speech,
#   * the marker "LI" when the link to the wanted entry could not be isolated.
with codecs.open('log.txt', 'w', 'utf-8') as log_file, codecs.open('Word of the Day.txt', 'r', 'utf-8') as anki_file:
    for line in anki_file:
        fields = line.split('\t')
        base_word = fields[0].replace('&#039;', "'")
        word_part = fields[2].strip()

        # Progress output, reduced to ASCII so it is safe on any console.
        # Written as a call on a str so it prints the same text on Python 2
        # and also parses on Python 3.
        print(base_word.encode('ascii', 'ignore').decode('ascii'))

        url = "http://dictionary.cambridge.org/search/english-russian/direct/?q=" + base_word.replace(' ', '+')
        word, definition = get_definition(url)

        if word == word_part or word_part not in ["adjective", "noun", "verb", "adverb", "pronoun"]:
            # Either the first entry already has the wanted part of speech,
            # or the wanted one is not a label we know how to search for.
            log_file.write(definition + "\n")
        elif len(definition) == 0:
            log_file.write("\n")
        else:
            # The first entry has a different part of speech: look through the
            # page's list of entries for a link to the headword with the
            # wanted one.
            r = requests.get(url, headers=headers)
            # base_word is arbitrary text from the input file, so it must be
            # escaped before being spliced into the pattern; otherwise
            # characters such as "?", "(" or "+" would be read as regex
            # syntax.  word_part is one of the five plain labels checked
            # above and needs no escaping.
            url_regex = r'<div class="oflow-hide">[\s\S]*<li><a href=("([^"]+)"[\s\S]*?)<b class="hw"[^>]+>%s</b></span>\s*?<span class="pos">%s</span>' % (re.escape(base_word), word_part)
            m = re.search(url_regex, r.text)
            if not m:
                log_file.write("<p>Different Part of Speech:</p>" + definition + "\n")
            else:
                s = m.group(1)
                if "<li>" not in s:
                    # The match stayed inside a single list item, so group 2
                    # is the link that belongs to the wanted entry.
                    url = m.group(2)
                    word, definition = get_definition(url)
                    log_file.write(definition + "\n")
                else:
                    # The match ran across several list items; the link
                    # cannot be trusted, so leave a marker in the log.
                    log_file.write("LI" + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment