Skip to content

Instantly share code, notes, and snippets.

@calpt
Last active September 8, 2021 13:40
Show Gist options
  • Save calpt/73f9cb8be6aaf1dbc780b05a9e971e92 to your computer and use it in GitHub Desktop.
Save calpt/73f9cb8be6aaf1dbc780b05a9e971e92 to your computer and use it in GitHub Desktop.
TU Moodle scraper
"""
A simple module providing scraping utilities for Moodle.
Use `python moodle.py -h` to show available command-line options.
Moodle web service API (official):
https://docs.moodle.org/dev/Web_service_API_functions
API method documentation (unofficial):
https://learn.cineca.it/pluginfile.php/1/theme_adaptable/adaptablemarketingimages/0/api.htm
"""
import argparse
import csv
import gender_guesser.detector as gender
import json
from lxml import html
import re
import requests
import sys
import tqdm
from urllib.parse import urljoin
# Base address of the Moodle installation; service URLs are joined onto it.
MOODLE_URL = "https://moodle.tu-darmstadt.de"
# SSO login endpoint. Its `service` query parameter carries the URL-encoded
# address of this Moodle instance's login page.
SSO_URL = (
    "https://sso.tu-darmstadt.de/login"
    "?service=https%3A%2F%2Fmoodle.tu-darmstadt.de%2Flogin%2Findex.php"
)
class Moodle:
    """Small client for a Moodle instance that sits behind an SSO login.

    After `sso_login()` has succeeded, the other methods call Moodle's AJAX
    service endpoint (`lib/ajax/service.php`) through the shared session.
    """

    def __init__(self):
        self.url = MOODLE_URL
        self.session = requests.Session()
        # Populated by sso_login(); holds the page's `M.cfg` dict plus 'userid'.
        self.config = None
        self.service_page = "lib/ajax/service.php"

    def _get_login_execution_value(self, url):
        """Fetch the login page and return the hidden `execution` form value.

        Args:
            url (str): URL of the SSO login page.

        Returns:
            str: Value of the `execution` input inside the form with id `fm1`.

        Raises:
            RuntimeError: If the page does not contain that form field.
        """
        resp = self.session.get(url)
        tree = html.fromstring(resp.text)
        values = tree.xpath("//form[@id='fm1']//input[@name='execution']/@value")
        if not values:
            raise RuntimeError("SSO login page has no 'execution' field in form 'fm1'")
        return values[0]

    def sso_login(self, username, password):
        """Log in via SSO and store the Moodle page configuration.

        Posts the credentials to `SSO_URL`, then extracts the `M.cfg` JSON
        object and the `data-user-id` attribute from the returned page.

        Args:
            username (str): SSO user name.
            password (str): SSO password.

        Returns:
            dict: The parsed `M.cfg` object with an added integer 'userid' key.

        Raises:
            RuntimeError: If the response page lacks the config or the user id
                (e.g. because the login was rejected).
        """
        # get the value needed for login
        execution_value = self._get_login_execution_value(SSO_URL)
        data = {
            'username': username,
            'password': password,
            '_eventId': 'submit',
            'execution': execution_value,
        }
        resp_text = self.session.post(SSO_URL, data=data).text
        match_config = re.search(r"M\.cfg = (\{.*?\});", resp_text)
        match_id = re.search(r"data\-user\-id=\"(\d+)\"", resp_text)
        if match_config is None or match_id is None:
            raise RuntimeError(
                "SSO login failed: Moodle config or user id not found in response"
            )
        config = json.loads(match_config.group(1))
        config['userid'] = int(match_id.group(1))
        # Only publish the config once it is complete.
        self.config = config
        return self.config

    def call_service(self, name, args, timeout=5):
        """Call a single Moodle AJAX service method.

        Requires a prior successful `sso_login()`, because the session key is
        read from `self.config`.

        Args:
            name (str): Name of the service method.
            args (dict): Method arguments. If it contains a 'userid' key, the
                value is replaced by the logged-in user's id in the request.
            timeout (float, optional): Request timeout in seconds. Defaults to 5.

        Returns:
            requests.Response or None: The response, or None on a read timeout.
        """
        url = urljoin(self.url, self.service_page)
        params = {
            'sesskey': self.config['sesskey'],
            'info': name,
        }
        # optionally add userid (on a copy, so the caller's dict is untouched)
        if 'userid' in args:
            args = dict(args, userid=self.config['userid'])
        data = [
            {
                "index": 0,
                "methodname": name,
                "args": args,
            }
        ]
        try:
            return self.session.post(url, params=params, json=data, timeout=timeout)
        except requests.exceptions.ReadTimeout:
            return None

    def _service_data(self, method, args):
        """Return the 'data' payload of a service call, or None if it failed.

        A call counts as failed when it timed out (`call_service` returned
        None) or answered with a status code other than 200.
        """
        resp = self.call_service(method, args)
        if resp is None or resp.status_code != 200:
            return None
        return resp.json()[0]['data']

    @staticmethod
    def _split_fullname(fullname):
        """Split a full name into (first token, remaining tokens joined)."""
        parts = fullname.split()
        firstname = parts[0] if parts else ""
        return firstname, " ".join(parts[1:])

    def user_profiles(self, values, field="id"):
        """Gets user profile information for given search values.

        Args:
            values (list): Search values. Type defined by 'field'.
            field (str, optional): User profile field in which to search. Defaults to "id".

        Returns:
            list: Found user profiles; empty if the request failed.
        """
        method = "core_user_get_users_by_field"
        args = {
            "values": values,
            "field": field,
        }
        data = self._service_data(method, args)
        return [] if data is None else data

    def courselist(self, courseid, search_pattern="", guess_gender=True):
        """Gets a list of all users attending a course.

        Each user's full name is split on whitespace: the first token becomes
        'firstname', all remaining tokens become 'lastname' (empty if the name
        has a single token).

        Args:
            courseid (int): ID of course.
            search_pattern (str, optional): Pattern used to search for certain names. Defaults to "".
            guess_gender (bool, optional): Guess gender of user by name. Defaults to True.

        Returns:
            list: List of user dicts with 'id', 'firstname', 'lastname' and,
                if requested, 'gender'; empty if the request failed.
        """
        method = "core_message_data_for_messagearea_search_users_in_course"
        args = {
            'userid': None,
            'courseid': courseid,
            'search': search_pattern,
            'limitnum': 0,
            'limitfrom': 0,
        }
        data = self._service_data(method, args)
        if data is None:
            return []
        # Only build the detector when it is actually needed.
        detector = gender.Detector() if guess_gender else None
        results = []
        for user in data['contacts']:
            firstname, lastname = self._split_fullname(user['fullname'])
            item = {
                'id': user['userid'],
                'firstname': firstname,
                'lastname': lastname,
            }
            if detector is not None:
                item['gender'] = detector.get_gender(firstname)
            results.append(item)
        return results

    def search_courses(self, search_value, page=0, perpage=50):
        """Search for courses by name.

        Args:
            search_value (str): Search pattern.
            page (int, optional): Defaults to 0.
            perpage (int, optional): Results per page. Defaults to 50.

        Returns:
            list: List of found courses; empty if the request failed.
        """
        method = "core_course_search_courses"
        args = {
            'criterianame': 'search',
            'criteriavalue': search_value,
            'page': page,
            'perpage': perpage,
        }
        data = self._service_data(method, args)
        return [] if data is None else data['courses']

    def extract_user_courses(self, search_value, userid, page=0, perpage=50, no_progress=False):
        """Search for courses a user attends.

        Runs a course search and then fetches the participant list of every
        hit, keeping the courses whose list contains `userid`.

        Args:
            search_value (str): Search pattern to search for courses.
            userid (int): ID of user to search for.
            page (int, optional): Defaults to 0.
            perpage (int, optional): Results per page. Defaults to 50.
            no_progress (bool, optional): No progress bar. Defaults to False.

        Returns:
            list: List of matching courses.
        """
        all_courses = self.search_courses(search_value, page, perpage)
        matching_courses = []
        progbar = tqdm.tqdm(all_courses, disable=no_progress)
        for result in progbar:
            progbar.set_description(result['shortname'][:20])
            # Only ids are compared here, so skip the gender guessing.
            users = self.courselist(result['id'], guess_gender=False)
            if any(user['id'] == userid for user in users):
                matching_courses.append(result)
        return matching_courses
def to_csv(results, file=None):
    """Write a sequence of dict records as CSV.

    The keys of the first record are written as the header row; every record
    (including the first) then contributes one row made of its values.

    Args:
        results (iterable of dict): Records to write.
        file (str, optional): Path of the output file. If empty or None, the
            rows are written to standard output.
    """
    # newline='' lets the csv module control the line endings itself.
    f = open(file, 'w', encoding='utf-8', newline='') if file else sys.stdout
    try:
        writer = csv.writer(f)
        for i, result in enumerate(results):
            if i == 0:
                writer.writerow(result.keys())
            writer.writerow(result.values())
    finally:
        # Close the file even if a row fails to serialize; never close stdout.
        if f is not sys.stdout:
            f.close()
if __name__ == "__main__":
    # --- command-line interface -------------------------------------------
    parser = argparse.ArgumentParser(description="Some scraping utilities for Moodle.")
    parser.add_argument('-o', dest="output", type=str, default=None, help="the output file (stdout if empty)")
    parser.add_argument('-c', '--config', type=str, default="config.json", help="the config file")
    parser.add_argument('-q', '--quiet', action="store_true")
    commands = parser.add_subparsers(
        title="scraping commands",
        dest="command",
        metavar="COMMAND",
        description="use 'moodle.py COMMAND -h' to show help for command",
    )
    # A sub-command must always be given.
    commands.required = True

    # profiles: look up user profiles by id
    cmd_profiles = commands.add_parser("profiles", help="get user profiles")
    cmd_profiles.add_argument('ids', type=str, help="comma-separated list of user ids")

    # courselist: list the users of one course
    cmd_courselist = commands.add_parser("courselist", help="get list of all users in course")
    cmd_courselist.add_argument('courseid', type=int, help="course id")
    cmd_courselist.add_argument('--gender', action="store_true", help="guess gender of users")

    # searchcourses: search courses by name
    cmd_search = commands.add_parser("searchcourses", help="search courses")
    cmd_search.add_argument('searchpattern', type=str, help="course name search pattern")
    cmd_search.add_argument('--limit', type=int, default=50, help="limit of courses to be searched")

    # usercourses: find the courses a given user attends
    cmd_usercourses = commands.add_parser("usercourses", help="search courses a user attends")
    cmd_usercourses.add_argument('searchpattern', type=str, help="course name search pattern")
    cmd_usercourses.add_argument('userid', type=int, help="user id to search for")
    cmd_usercourses.add_argument('--limit', type=int, default=50, help="limit of courses to be searched")

    args = parser.parse_args()

    # --- log in with the credentials from the config file -------------------
    moodle = Moodle()
    with open(args.config, 'r') as config_file:
        credentials = json.load(config_file)
    moodle.sso_login(credentials['username'], credentials['password'])

    # --- run the selected command -------------------------------------------
    # Each entry is evaluated lazily, so only the chosen command hits the server.
    handlers = {
        'profiles': lambda: moodle.user_profiles(args.ids.split(',')),
        'courselist': lambda: moodle.courselist(args.courseid, guess_gender=args.gender),
        'searchcourses': lambda: moodle.search_courses(args.searchpattern, perpage=args.limit),
        'usercourses': lambda: moodle.extract_user_courses(
            args.searchpattern, args.userid, perpage=args.limit, no_progress=args.quiet
        ),
    }
    handler = handlers.get(args.command)
    results = handler() if handler is not None else []

    # Nothing is written when the command produced no results.
    if results:
        to_csv(results, args.output)
gender_guesser == 0.4.0
lxml == 4.5.1
requests == 2.23.0
tqdm == 4.46.0
{
"username": "abcdefg",
"password": "password123"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment