Skip to content

Instantly share code, notes, and snippets.

@branw
Created April 27, 2020 17:40
Show Gist options
  • Save branw/0da4635245453809f3a937f7187806dc to your computer and use it in GitHub Desktop.
Save branw/0da4635245453809f3a937f7187806dc to your computer and use it in GitHub Desktop.
Web app interface for scraping Chegg textbook solutions (Sept. 2018)
import requests
from requests.auth import HTTPBasicAuth
from pprint import pprint
import secrets
import uuid
from urllib.parse import urlparse
from flask import Flask, jsonify, render_template, request, abort, redirect
from collections import deque
# Shared HTTP session used for every request to hub.chegg.com in this module.
# NOTE(review): the basic-auth pair is hard-coded in source; consider loading
# it from configuration or the environment instead.
s = requests.Session()
s.auth = HTTPBasicAuth('hlDpZAPF05mqjAmg7cqtIKLOhUryB8p1', 'uBjzakmxGx6WtqAr')

# Random device/session identifiers are generated once per process start.
s.headers['X-CHEGG-DEVICEID'] = secrets.token_hex(8)
s.headers['X-CHEGG-SESSIONID'] = str(uuid.uuid4())
# The 'X-CHEGG-XYZPASS': '1' header is intentionally left disabled.
s.headers['User-Agent'] = 'Dalvik/2.1.0 (Linux; U; Android 9; Pixel XL Build/PPR1.180610.009)'

# In-memory caches (never persisted, reset on restart).
tbs_recent_books = deque(maxlen=10)  # ids of the last ten newly fetched books
tbs_books, tbs_chapters, tbs_problems = {}, {}, {}  # keyed by book id / book id / chapter id
def get_tbs_books(query):
    """Search the hub book endpoint for books matching *query*.

    Args:
        query: Free-text search string, sent as the ``q`` parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
    """
    r = s.get('https://hub.chegg.com/v1/book', params={
        'q': query,
        # The original used the bare name ``true``, which is undefined in
        # Python and raised NameError on every call. The lowercase string is
        # sent instead of ``True`` (which would be encoded as "True").
        # NOTE(review): assumes the API expects a JSON-style "true" - confirm.
        'f.hasSolutions': 'true',
    })
    r.raise_for_status()
    return r.json()
def get_tbs_book(book_id):
    """Return summary metadata for a book, fetching and caching it on first use.

    Args:
        book_id: Identifier of the book in the hub API.

    Returns:
        A dict with the keys ``id``, ``name``, ``full_name``, ``edition``,
        ``image`` and ``has_solutions``, or ``None`` when the API answers 404.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    if book_id in tbs_books:
        return tbs_books[book_id]

    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}')
    if r.status_code == 404:
        return None
    r.raise_for_status()

    result = r.json()['result']
    output = {
        'id': book_id,
        'name': result['title'],
        'full_name': result['fullTitle'],
        'edition': result['edition'],
        # Prefer the large cover; fall back to the thumbnail when it is absent.
        'image': result['imgLarge'] if 'imgLarge' in result else result['imgThumb'],
        'has_solutions': result['hasSolutions'],
    }

    # Track the book in the bounded "recent" list shown on the index page.
    if book_id not in tbs_recent_books:
        tbs_recent_books.append(book_id)
    tbs_books[book_id] = output
    return output
def get_tbs_chapters(book_id, offset=0, all=True):
    """Fetch the chapter list of a book, following pagination.

    Args:
        book_id: Identifier of the book in the hub API.
        offset: Value sent as the ``offset`` query parameter of the first
            request.
        all: When true, keep following ``nextPage`` links until none is left;
            otherwise return only the first page.

    Returns:
        A list with the concatenated ``result`` entries of every fetched page.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Only the default "everything from the start" request may use the cache.
    # Previously a partial fetch (offset != 0 or all=False) was stored under
    # the same key and later returned as if it were the complete list.
    cacheable = bool(all) and offset == 0
    if cacheable and book_id in tbs_chapters:
        return tbs_chapters[book_id]

    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}/chapters', params={
        'offset': offset,
    })
    r.raise_for_status()
    j = r.json()
    output = list(j['result'])

    while all and 'nextPage' in j:
        r = s.get(j['nextPage'])
        r.raise_for_status()
        j = r.json()
        output.extend(j['result'])

    if cacheable:
        tbs_chapters[book_id] = output
    return output
def get_tbs_problems(chapter_id, offset=0, all=True):
    """Fetch the problem list of a chapter, following pagination.

    Args:
        chapter_id: Identifier of the chapter in the hub API.
        offset: Value sent as the ``offset`` query parameter of the first
            request.
        all: When true, keep following ``nextPage`` links until none is left;
            otherwise return only the first page.

    Returns:
        A list with the concatenated ``result`` entries of every fetched page.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    # Only the default "everything from the start" request may use the cache.
    # Previously a partial fetch (offset != 0 or all=False) was stored under
    # the same key and later returned as if it were the complete list.
    cacheable = bool(all) and offset == 0
    if cacheable and chapter_id in tbs_problems:
        return tbs_problems[chapter_id]

    r = s.get(f'https://hub.chegg.com/v1/chapter/{chapter_id}/problems', params={
        'offset': offset,
    })
    r.raise_for_status()
    j = r.json()
    output = list(j['result'])

    while all and 'nextPage' in j:
        r = s.get(j['nextPage'])
        r.raise_for_status()
        j = r.json()
        output.extend(j['result'])

    if cacheable:
        tbs_problems[chapter_id] = output
    return output
def get_tbs_problem_text(problem_id):
    """Download the HTML statement of a problem.

    Returns the response body as text, or ``None`` when the server answers
    404. Any other error status is raised by ``raise_for_status``.
    """
    url = f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html'
    response = s.get(url)
    if response.status_code != 404:
        response.raise_for_status()
        return response.text
    return None
def get_tbs_solutions(problem_id):
    """Return the ``result`` field of the solutions endpoint for a problem.

    An error status is raised by ``raise_for_status``.
    """
    response = s.get(f'https://hub.chegg.com/v1/problem/{problem_id}/solutions')
    response.raise_for_status()
    return response.json()['result']
def load_solutions(problem_id):
    """Download every step of every solution of a problem.

    Args:
        problem_id: Identifier of the problem in the hub API.

    Returns:
        A list with one dict per solution, each holding ``num_steps`` (the
        number of steps) and ``steps`` (a list of ``{'i': <1-based index>,
        'text': <body fetched from the step's link>}`` dicts).
    """
    output = []
    for solution in get_tbs_solutions(problem_id):
        # Use the steps of the solution being iterated. The original always
        # read solutions[0], so every entry repeated the first solution.
        steps = solution['steps']
        solution_output = []
        for i, step in enumerate(steps, start=1):
            r = s.get(step['link'])
            solution_output.append({
                'i': i,
                'text': r.text,
            })
        output.append({
            'num_steps': len(steps),
            'steps': solution_output,
        })
    return output
def load_problems(chapter_id):
    """Return the chapter's problems reduced to their ``name`` and ``id``."""
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_problems(chapter_id)
    ]
def load_chapters(book_id):
    """Return the book's chapters reduced to their ``name`` and ``id``."""
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_chapters(book_id)
    ]
# Flask application object; the route handlers in this module register on it.
app = Flask(__name__)
@app.route('/book/<int:book_id>/chapters')
def get_chapters(book_id):
    """JSON endpoint: the chapter entries of a book, as returned upstream."""
    chapter_entries = get_tbs_chapters(book_id)
    return jsonify(chapter_entries)
@app.route('/chapter/<int:chapter_id>/problems')
def get_problems(chapter_id):
    """JSON endpoint: the problem entries of a chapter, as returned upstream."""
    problem_entries = get_tbs_problems(chapter_id)
    return jsonify(problem_entries)
@app.route('/problem/<int:problem_id>')
def get_problem(problem_id):
    """Return the HTML statement of a problem, or an empty body on failure.

    Any upstream status other than 200 yields an empty string rather than an
    error response.
    """
    r = s.get(f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html')
    # Compare by value: ``is 200`` tests object identity, which only works by
    # accident of CPython's small-int caching and emits a SyntaxWarning on
    # Python 3.8+.
    return r.text if r.status_code == 200 else ''
@app.route('/problem/<int:problem_id>/solutions')
def get_solutions(problem_id):
    """JSON endpoint: every solution of a problem with its downloaded steps."""
    solution_entries = load_solutions(problem_id)
    return jsonify(solution_entries)
@app.route('/')
def request_index():
    """Render the landing page listing the recently loaded books."""
    recent = []
    for recent_id in tbs_recent_books:
        # Every id in the recent list was cached in tbs_books when first fetched.
        recent.append(tbs_books[recent_id])
    return render_template('chegg.html', recent_books=recent)
@app.route('/query', methods=['POST'])
def request_query():
    """Redirect a pasted URL to the matching local page.

    Reads the ``query`` form field, looks only at its URL path, and redirects
    to ``/qna/<id>`` for question-and-answer paths or ``/tbs/<id>`` for other
    ``/homework-help/`` paths. Anything else is answered with 501.
    """
    submitted = request.form['query']
    url_path = urlparse(submitted).path

    # Checked first: this prefix is a special case of '/homework-help/'.
    if url_path.startswith('/homework-help/questions-and-answers/'):
        # The id is whatever follows the last 'q' in the path.
        question_id = url_path.rsplit('q', 1)[-1]
        # NOTE(review): no '/qna/' route is visible in this file - confirm
        # it is defined elsewhere.
        return redirect(f'/qna/{question_id}', code=302)

    if url_path.startswith('/homework-help/'):
        # The id is whatever follows the last '-' in the path.
        book_id = url_path.rsplit('-', 1)[-1]
        return redirect(f'/tbs/{book_id}', code=302)

    abort(501)
@app.route('/tbs/<int:book_id>')
def request_tbs_book(book_id):
    """Render the page for a book together with its chapter list.

    Responds with 404 when the book lookup reports that the book is unknown.
    """
    book = get_tbs_book(book_id)
    if book is None:
        # get_tbs_book returns None on an upstream 404. Previously rendering
        # continued with book=None and went on to request the chapter list
        # of a book that does not exist.
        abort(404)

    current = {
        'book': {
            'id': book_id,
        },
    }
    return render_template('chegg.html', current=current, book=book,
                           chapters=load_chapters(book_id))
@app.route('/tbs/<int:book_id>/<int:chapter_id>')
def request_tbs_chapter(book_id, chapter_id):
    """Render the page for a chapter: book info, chapters and its problems.

    Responds with 404 when the chapter id is not among the book's chapters.
    """
    chapters = get_tbs_chapters(book_id)
    # Chapter entries are matched against the string form of the URL's id.
    wanted_id = str(chapter_id)
    # A default of None avoids an uncaught StopIteration (a 500 response)
    # when the chapter does not belong to this book.
    chapter_info = next((item for item in chapters if item['id'] == wanted_id), None)
    if chapter_info is None:
        abort(404)

    current = {
        'chapter': {
            'name': chapter_info['name'],
            'id': chapter_id,
        },
        'book': {
            'id': book_id,
        },
    }
    return render_template('chegg.html', current=current, book=get_tbs_book(book_id),
                           chapters=load_chapters(book_id), problems=load_problems(chapter_id))
@app.route('/tbs/<int:book_id>/<int:chapter_id>/<int:problem_id>')
def request_tbs_problem(book_id, chapter_id, problem_id):
    """Render the page for a problem: its statement and all solution steps.

    Responds with 404 when the problem is not among the chapter's problems or
    the chapter is not among the book's chapters.
    """
    problems = get_tbs_problems(chapter_id)
    wanted_problem = str(problem_id)
    # A default of None avoids an uncaught StopIteration (a 500 response)
    # for ids that do not exist under the given parent.
    problem_info = next((item for item in problems if item['id'] == wanted_problem), None)
    if problem_info is None:
        abort(404)
    problem_text = get_tbs_problem_text(problem_id)

    chapters = get_tbs_chapters(book_id)
    wanted_chapter = str(chapter_id)
    chapter_info = next((item for item in chapters if item['id'] == wanted_chapter), None)
    if chapter_info is None:
        abort(404)

    solutions = load_solutions(problem_id)
    current = {
        'problem': {
            'name': problem_info['name'],
            'id': problem_id,
        },
        'chapter': {
            'name': chapter_info['name'],
            'id': chapter_id,
        },
        'book': {
            'id': book_id,
        },
    }
    return render_template('chegg.html', current=current, book=get_tbs_book(book_id),
                           chapters=load_chapters(book_id), problems=load_problems(chapter_id),
                           problem_text=problem_text, solutions=solutions)
<!doctype html>
{#
  Single page used by every view of the app.
  Context variables read below (all optional):
    current        - ids/names of the selected book, chapter and problem
    book           - metadata of the selected book (image, full_name)
    chapters       - chapter list for the left-hand column
    problems       - problem list for the left-hand column
    recent_books   - books shown when nothing is selected
    problem_text   - HTML of the problem statement, or None
    solutions      - list of solutions, each with num_steps and steps
#}
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <title>Chegg</title>
  <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
  <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>
</head>
<body class="bg-light">
<div class="container">
  <div class="py-5 text-center">
    {# Banner: contents of <pre> are whitespace-sensitive, keep them at column 0. #}
    <h6><a href="/">&#x2063;<pre>
📔📚&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;📚
📓📚📖&ensp;&ensp;😫&ensp;&ensp;📚📚📓
📕📚📚&ensp;&ensp;📝&ensp;&ensp;📗💻📘
📖&#x2063;📖📖📖📖📖📖📖📖
Doing my homework!
✏️📝✏️📝✏️📝✏️📝
</pre></a></h6>
  </div>
  <div class="row">
    <div class="col-md-4">
      <form class="card p-2 list-group mb-3" action="/query" method="post">
        <div class="input-group">
          <input class="form-control" name="query" placeholder="Chegg URL" type="text">
          <div class="input-group-append">
            <button type="submit" class="btn btn-secondary">Load</button>
          </div>
        </div>
      </form>
      {% if book %}
      <div class="card mb-3">
        <div class="row no-gutters">
          <div class="col-4">
            <img src="{{ book.image }}" class="img-fluid" alt="">
          </div>
          <div class="col">
            <div class="card-body px-2">
              <h6 class="card-title">{{ book.full_name }}</h6>
            </div>
          </div>
        </div>
      </div>
      {% endif %}
      {% if chapters %}
      <ul class="list-group mb-3">
        <li class="list-group-item justify-content-between lh-condensed">
          <h5>Chapters</h5>
          <p>
            {# The selected chapter is highlighted; ids are compared as strings. #}
            {% for chapter in chapters %}
            <a href="/tbs/{{ current.book.id }}/{{ chapter.id }}" class="badge {% if current and current.chapter and current.chapter.id|string == chapter.id|string %}badge-dark{% else %}badge-light{% endif %}">{{ chapter.name }}</a>
            {% endfor %}
          </p>
        </li>
      </ul>
      {% endif %}
      {% if problems %}
      <ul class="list-group mb-3">
        <li class="list-group-item justify-content-between lh-condensed">
          <h5>Problems</h5>
          <p>
            {# The selected problem is highlighted; ids are compared as strings. #}
            {% for problem in problems %}
            <a href="/tbs/{{ current.book.id }}/{{ current.chapter.id }}/{{ problem.id }}" class="badge {% if current and current.problem and current.problem.id|string == problem.id|string %}badge-dark{% else %}badge-light{% endif %}">{{ problem.name }}</a>
            {% endfor %}
          </p>
        </li>
      </ul>
      {% endif %}
    </div>
    <div class="col-md-8">
      {% if not current %}
      <div class="card">
        <div class="card-body">
          <h2>Recent Books</h2>
          <div class="card-columns">
            {% for book in recent_books %}
            <div class="card">
              <a href="/tbs/{{ book.id }}"><img class="card-img-top" src="{{ book.image }}" alt="{{ book.full_name }}" title="{{ book.full_name }}"></a>
            </div>
            {% endfor %}
          </div>
        </div>
      </div>
      {% endif %}
      {% if current and current.problem %}
      <ul class="list-group mb-3">
        <li class="list-group-item justify-content-between">
          <div>
            <h5>Chapter {{ current.chapter.name }}, Problem {{ current.problem.name }}</h5>
          </div>
          {# NOTE(review): upstream HTML is rendered unescaped via |safe. #}
          {% if problem_text is not none %}
          {{ problem_text | safe }}
          {% endif %}
        </li>
      </ul>
      {% for solution in solutions %}
      <ul class="list-group mb-3">
        {% for step in solution.steps %}
        <li class="list-group-item justify-content-between">
          <div>
            <h5>Step {{ step.i }} <span class="text-muted">of {{ solution.num_steps }}</span></h5>
          </div>
          {{ step.text | safe }}
        </li>
        {% endfor %}
      </ul>
      {% endfor %}
      {% endif %}
    </div>
  </div>
</div>
</body>
</html>
@WeakCode777
Copy link

Can you provide step-by-step instructions on how to set this up?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment