Skip to content

Instantly share code, notes, and snippets.

@snorkysnark
Created October 28, 2022 13:41
Show Gist options
  • Save snorkysnark/94d17d3d2bfa75caa6a99d30ba2c619a to your computer and use it in GitHub Desktop.
Save snorkysnark/94d17d3d2bfa75caa6a99d30ba2c619a to your computer and use it in GitHub Desktop.
from ralsei import (
FnBuilder,
MapToNewColumns,
RalseiCli,
MapToNewTable,
Table,
ValueColumn,
GeneratorBuilder,
)
import requests
from tqdm import tqdm
import qs_funcs
def download(url: str):
tqdm.write("Downloading " + url)
response = requests.get(url)
response.raise_for_status()
return {"html": response.text}
def make_pipeline(args):
TABLE_uni_ranks_source = Table("ranks", "unis")
TABLE_uni_year_source = Table("qs_years", "unis")
TABLE_uni_pages = Table("pages", "uni_stats")
return {
"download": MapToNewTable(
select="""\
SELECT uni_url FROM {{ source }} r
JOIN {{ years_table }} s ON r.source_uuid = s.uuid
WHERE year = {{ year }} AND NOT {{ is_done }}""",
table=TABLE_uni_pages,
columns=[
ValueColumn("uni_url", "TEXT PRIMARY KEY"),
ValueColumn("html", "TEXT"),
"date_downloaded DATE DEFAULT NOW()",
],
fn=GeneratorBuilder.from_fn(download)
.rename_input({"uni_url": "url"})
.pop_id_fields("uni_url", keep=True),
source_table=TABLE_uni_ranks_source,
is_done_column="__uni_downloaded",
params={
"years_table": TABLE_uni_year_source,
"year": args.year,
},
),
"parse": MapToNewColumns(
select="SELECT uni_url, html FROM {{ table }}",
table=TABLE_uni_pages,
columns=[
ValueColumn("name", "TEXT"),
ValueColumn("total_students", "INT"),
ValueColumn("international_students", "INT"),
ValueColumn("total_faculty_staff", "INT"),
ValueColumn("total_ug_students_percent", "FLOAT"),
ValueColumn("total_pg_students_percent", "FLOAT"),
ValueColumn("international_ug_students_percent", "FLOAT"),
ValueColumn("international_pg_students_percent", "FLOAT"),
ValueColumn("domestic_staff_percent", "FLOAT"),
ValueColumn("international_staff_percent", "FLOAT"),
],
fn=FnBuilder(qs_funcs.parse_uni_stats).pop_id_fields("uni_url"),
),
}
if __name__ == "__main__":
cli = RalseiCli()
cli.add_argument("--year", type=int, default=2023)
cli.run(make_pipeline, "dbname=qs_new")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment