Skip to content

Instantly share code, notes, and snippets.

@branw
Last active May 11, 2020 18:25
Show Gist options
  • Save branw/195a6f1a34f3068ba63d9ebfaca17b47 to your computer and use it in GitHub Desktop.
Save branw/195a6f1a34f3068ba63d9ebfaca17b47 to your computer and use it in GitHub Desktop.
Chegg textbook solutions scraper (Sept. 2018) -- exploited client-side enforcement of trial mode on old Android app (hardcoded account is now deleted)
import requests
from requests.auth import HTTPBasicAuth
from pprint import pprint
import secrets
import uuid
from urllib.parse import urlparse
from collections import deque
import pickle
import sys
# Shared HTTP session: every request helper in this file goes through `s`,
# so the auth and headers configured here are sent with each call.
s = requests.Session()

# HTTP basic-auth credentials are hard-coded in the script.
s.auth = HTTPBasicAuth(
    'hlDpZAPF05mqjAmg7cqtIKLOhUryB8p1',
    'uBjzakmxGx6WtqAr',
)

# A fresh random device id and session id are generated on every run.
s.headers['X-CHEGG-DEVICEID'] = secrets.token_hex(8)
s.headers['X-CHEGG-SESSIONID'] = str(uuid.uuid4())
s.headers['X-CHEGG-XYZPASS'] = '1'
s.headers['User-Agent'] = 'Dalvik/2.1.0 (Linux; U; Android 9; Pixel XL Build/PPR1.180610.009)'
def get_tbs_book(book_id):
    """Fetch the metadata of one book.

    Sends a GET to ``https://hub.chegg.com/v1/book/{book_id}`` through the
    module-level session ``s`` and reshapes the ``result`` object of the
    JSON response.

    Args:
        book_id: Identifier interpolated into the request URL and echoed
            back under the ``id`` key.

    Returns:
        ``None`` when the server answers 404, otherwise a dict with the keys
        ``id``, ``name``, ``full_name``, ``edition``, ``image`` and
        ``has_solutions``.

    Raises:
        requests.HTTPError: For any other error status (raised by
            ``raise_for_status``).
        KeyError: When the response lacks one of the fields read below.
    """
    r = s.get(f'https://hub.chegg.com/v1/book/{book_id}')
    if r.status_code == 404:
        return None
    r.raise_for_status()

    result = r.json()['result']
    return {
        'id': book_id,
        'name': result['title'],
        'full_name': result['fullTitle'],
        'edition': result['edition'],
        # Prefer the large cover; fall back to the thumbnail only when the
        # large one is absent (the thumbnail key is not touched otherwise).
        'image': result['imgLarge'] if 'imgLarge' in result else result['imgThumb'],
        'has_solutions': result['hasSolutions'],
    }
def get_tbs_chapters(book_id, offset=0, all=True):
    """Return the chapter entries of a book, following pagination.

    Args:
        book_id: Identifier interpolated into the chapters URL.
        offset: Sent as the ``offset`` query parameter of the first request.
        all: When truthy, keep requesting the ``nextPage`` URL found in each
            response until a response has no such key; when falsy, only the
            first page is returned.

    Returns:
        A list holding the concatenated ``result`` arrays of every page
        fetched.

    Raises:
        requests.HTTPError: When any page request returns an error status.
    """
    response = s.get(
        f'https://hub.chegg.com/v1/book/{book_id}/chapters',
        params={'offset': offset},
    )
    chapters = []
    while True:
        response.raise_for_status()
        page = response.json()
        chapters.extend(page['result'])
        if not (all and 'nextPage' in page):
            return chapters
        # The server supplies the full URL of the following page.
        response = s.get(page['nextPage'])
def get_tbs_problems(chapter_id, offset=0, all_problems=True):
    """Return the problem entries of a chapter, following pagination.

    Args:
        chapter_id: Identifier interpolated into the problems URL.
        offset: Sent as the ``offset`` query parameter of the first request.
        all_problems: When truthy, keep requesting the ``nextPage`` URL found
            in each response until it is absent; when falsy, stop after the
            first page.

    Returns:
        A list holding the concatenated ``result`` arrays of every page
        fetched.

    Raises:
        requests.HTTPError: When any page request returns an error status.
    """
    first_url = f'https://hub.chegg.com/v1/chapter/{chapter_id}/problems'
    reply = s.get(first_url, params={'offset': offset})
    reply.raise_for_status()
    payload = reply.json()
    problems = list(payload['result'])

    if not all_problems:
        return problems

    # Walk the chain of pages advertised by the server.
    while 'nextPage' in payload:
        reply = s.get(payload['nextPage'])
        reply.raise_for_status()
        payload = reply.json()
        problems += payload['result']
    return problems
def get_tbs_problem_text(problem_id):
    """Fetch the body of a problem's ``.html`` content page.

    Args:
        problem_id: Identifier interpolated into the content URL.

    Returns:
        The response body as text, or ``None`` when the server answers 404.

    Raises:
        requests.HTTPError: For any other error status.
    """
    url = f'https://hub.chegg.com/content/tbs-problem/{problem_id}.html'
    response = s.get(url)
    if response.status_code != 404:
        response.raise_for_status()
        return response.text
    return None
def get_tbs_solutions(problem_id):
    """Return the ``result`` field of the solutions endpoint for a problem.

    Args:
        problem_id: Identifier interpolated into the solutions URL.

    Raises:
        requests.HTTPError: When the request returns an error status.
    """
    url = f'https://hub.chegg.com/v1/problem/{problem_id}/solutions'
    response = s.get(url)
    response.raise_for_status()
    return response.json()['result']
def load_solutions(problem_id):
    """Download the text of every step of every solution of a problem.

    Looks up the solutions with ``get_tbs_solutions`` and then fetches the
    ``link`` of each step through the shared session ``s``.

    Args:
        problem_id: Identifier of the problem whose solutions are loaded.

    Returns:
        A list with one dict per solution, each of the form
        ``{'num_steps': int, 'steps': [{'i': 1-based index, 'text': str}]}``.

    Raises:
        requests.HTTPError: When the solutions lookup itself fails.
    """
    output = []
    for solution in get_tbs_solutions(problem_id):
        # Read the steps of the solution being iterated; indexing
        # solutions[0] here would repeat the first solution's steps for
        # every entry.
        steps = solution['steps']
        # NOTE: the status of the step request is not checked, so the body
        # of an error response would be stored as the step text.
        step_texts = [
            {'i': number, 'text': s.get(step['link']).text}
            for number, step in enumerate(steps, start=1)
        ]
        output.append({
            'num_steps': len(steps),
            'steps': step_texts,
        })
    return output
def load_problems(chapter_id):
    """Return ``{'name', 'id'}`` dicts for every problem of a chapter.

    The entries come from ``get_tbs_problems`` (all pages); only the
    ``name`` and ``id`` fields of each entry are kept.
    """
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_problems(chapter_id)
    ]
def load_chapters(book_id):
    """Return ``{'name', 'id'}`` dicts for every chapter of a book.

    The entries come from ``get_tbs_chapters`` (all pages); only the
    ``name`` and ``id`` fields of each entry are kept.
    """
    return [
        {'name': entry['name'], 'id': entry['id']}
        for entry in get_tbs_chapters(book_id)
    ]
if __name__ == '__main__':
    # Script entry point: download one book's chapters, problems and first
    # solutions, render them into a single HTML page, and pickle the raw
    # solution data to 'book.pickle'.
    from html import escape

    def esc(value):
        # Names and ids come from the remote API; escape them (quotes
        # included) before placing them in markup or attribute values.
        return escape(str(value))

    # NOTE(review): REDACTED is not defined anywhere in the visible file --
    # presumably a placeholder for the real book id. Replace it before
    # running, otherwise this line raises NameError.
    book_id = REDACTED
    all_solutions = {}

    book = get_tbs_book(book_id)
    if book is None:
        # get_tbs_book returns None on a 404; stop instead of failing later
        # with a TypeError when subscripting None.
        sys.exit(f'Book {book_id} not found')
    chapters = load_chapters(book_id)

    # Collect HTML fragments in a list and join once at the end.
    parts = [
        f"<section><img src='{esc(book['image'])}'><h1>{esc(book['full_name'])}</h1><p>"
        + ' '.join(
            f"<a href='#{esc(chapter['id'])}'>{esc(chapter['name'])}</a>"
            for chapter in chapters
        )
        + '</p></section>'
    ]

    for i, chapter in enumerate(chapters):
        print(i, 'Chapter', chapter['name'])
        chapter_id = esc(chapter['id'])
        chapter_name = esc(chapter['name'])
        problems = load_problems(chapter['id'])

        # Per-chapter index linking to each problem section.
        parts.append(
            f"<section id='{chapter_id}'><h2>Chapter {chapter_name}</h2><p>"
            + ' '.join(
                f"<a href='#{chapter_id}-{esc(problem['id'])}'>{esc(problem['name'])}</a>"
                for problem in problems
            )
            + "</p></section>"
        )

        for j, problem in enumerate(problems):
            print(i, j, 'Problem', problem['name'])
            solutions = load_solutions(problem['id'])
            if not solutions:
                continue

            # Only the first solution of each problem is rendered and saved.
            solution = solutions[0]
            all_solutions[problem['id']] = solution

            parts.append(
                f"<section id='{chapter_id}-{esc(problem['id'])}'><a href='#{chapter_id}'>Go to Chapter {chapter_name}</a><h3>Problem {esc(problem['name'])}</h3>"
            )
            # step['text'] is the response body fetched by load_solutions;
            # it is embedded as-is (presumably it is already HTML markup).
            parts.append(
                "<ul class='list-group mb-3'>"
                + ''.join(
                    f"<li class='list-group-item justify-content-between'><h4>Step {step['i']} <span class='text-muted'>of {solution['num_steps']}</span></h4>{step['text']}</li>"
                    for step in solution['steps']
                )
                + "</ul>"
            )
            parts.append("</section>")

    output = ''.join(parts)
    html_output = """<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
<title>Chegg</title>
<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/css/bootstrap.min.css" integrity="sha384-MCw98/SFnGE8fJT3GXwEOngsV7Zt27NXFoaoApmYm81iuXoPkFOJwJ8ERdknLPMO" crossorigin="anonymous">
<script src="https://stackpath.bootstrapcdn.com/bootstrap/4.1.3/js/bootstrap.min.js" integrity="sha384-ChfqqxuZUCnJSK3+MXmPNIyE6ZbWh2IMqE241rYiqJxyMiZ6OW/JmZQ5stwEULTy" crossorigin="anonymous"></script>
<style>
:root { font-size: 18px; }
section { page-break-after: always; }
</style>
</head>
<body class="bg-light">
""" + output + """
</body>
</html>
"""

    title = book['name'].lower().replace(' ', '-')
    # Write text (not bytes) to the text-mode file, encoded as UTF-8 to match
    # the <meta charset="utf-8"> declared in the page.
    with open(f'book-{book_id}-{title}.html', 'w', encoding='utf-8') as f:
        f.write(html_output)

    # Use a context manager so the pickle file is flushed and closed.
    with open('book.pickle', 'wb') as f:
        pickle.dump(all_solutions, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment