"""Retrieve AO3 freeform tag popularity for works of a given tag.

Usage:

    $ python freeform-tags.py "Clark Kent"

Requirements:

* cssselect
* lxml
* requests
"""
|
|
|
import argparse
import csv
import time
from collections import Counter
from collections.abc import Iterator
from typing import Optional
from urllib.parse import urljoin

import requests
from lxml import html
|
|
|
|
|
def print_top(counter: Counter, top: Optional[int] = None) -> None:
    """Print the most common entries of *counter* as a ranked list.

    Each printed line has the form
    ``<rank>: <repr of value> with <count> occurrences``; ranks start at 1
    and are right-aligned to a shared column width.

    Args:
        counter: The ``collections.Counter`` whose entries are ranked.
        top: Maximum number of entries to print. ``None`` prints them all.
    """
    ranked = counter.most_common(top)
    # Size the rank column from the rows that are actually printed rather
    # than from the requested ``top``; otherwise asking for the top 100 of a
    # three-entry counter would left-pad every rank with two spaces.
    width = len(str(len(ranked)))
    for rank, (value, occurrences) in enumerate(ranked, 1):
        print(f"{rank:{width}d}: {value!r} with {occurrences} occurrences")
|
|
|
|
|
def tag_search_substitution(tag: str) -> str:
    """Return *tag* with URL-hostile characters swapped for AO3 placeholders.

    AO3 tag URLs spell certain characters as ``*x*`` placeholders instead of
    using them literally. Besides ``.``, ``?`` and ``/``, this also covers
    ``&`` and ``#``; without them a tag such as ``"Clark Kent & Lois Lane"``
    is not spelled the way AO3 writes it, and a literal ``#`` would cut the
    URL short by starting a fragment.

    Args:
        tag: The tag name exactly as displayed on AO3.

    Returns:
        The tag name, ready to be placed in a ``/tags/<name>/works`` path.
    """
    placeholders = {
        "/": "*s*",
        "&": "*a*",
        ".": "*d*",
        "?": "*q*",
        "#": "*h*",
    }
    # No placeholder contains a character that is itself substituted, so the
    # order in which the replacements run does not matter.
    for char, placeholder in placeholders.items():
        tag = tag.replace(char, placeholder)
    return tag
|
|
|
|
|
def single_tag_work_search(tag: str) -> Counter[str]:
    """Tally the freeform tags found on the work listing pages for *tag*.

    Walks every page of ``https://archiveofourown.org/tags/<tag>/works`` and
    counts the text of each freeform tag element on the listed works. After
    each page the current top five are printed as a progress report.

    Args:
        tag: The tag name whose works should be scanned.

    Returns:
        A counter mapping freeform tag text to its number of occurrences.
    """
    tally: Counter[str] = Counter()
    listing_url = "https://archiveofourown.org/tags/{}/works".format(
        tag_search_substitution(tag)
    )

    page_no = 0
    for listing in search_paginator(listing_url):
        page_no += 1
        for anchor in listing.cssselect(".work .tags .freeforms .tag"):
            tally[anchor.text] += 1
        print(f"\nIntermediate result after fetching page {page_no}")
        print_top(tally, 5)
    return tally
|
|
|
|
|
def search_paginator(search_url: str) -> Iterator[html.HtmlElement]:
    """Yield every page of a paginated listing, starting at *search_url*.

    Each page is fetched with ``fetch_page`` and yielded as a parsed
    document. The next page is located through the ``.pagination .next a``
    link; iteration ends when a page has no such link.

    Args:
        search_url: URL of the first listing page.

    Yields:
        The parsed HTML document of each page, in listing order.
    """
    while True:
        page = fetch_page(search_url)
        yield page

        next_links = page.cssselect(".pagination .next a")
        if not next_links:
            break  # Exhausted the search!

        href = next_links[0].get("href")
        if not href:
            # urljoin() hands back the base URL unchanged for a missing or
            # empty href, which would refetch the same page forever.
            break
        search_url = urljoin(search_url, href)
|
|
|
|
|
def fetch_page(
    url: str,
    tries: int = 30,
    *,
    timeout: float = 60.0,
    retry_delay: float = 10.0,
) -> html.HtmlElement:
    """Download *url* and return it parsed, waiting out rate limiting.

    A response counts as rate limited when its status is 429 or its body
    starts with ``"Retry later"``. In that case the request is repeated after
    a pause, up to *tries* attempts in total.

    Args:
        url: The page to download.
        tries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server on each attempt, so a stalled
            connection cannot hang the script indefinitely.
        retry_delay: Seconds to pause between rate-limited attempts.

    Returns:
        The parsed HTML document.

    Raises:
        SystemExit: If every attempt was rate limited.
        requests.RequestException: Network errors and timeouts from
            ``requests.get`` are not caught here and propagate to the caller.
    """
    for attempt in range(1, tries + 1):
        response = requests.get(url, timeout=timeout)
        rate_limited = (
            response.status_code == 429
            or response.text.startswith("Retry later")
        )
        if not rate_limited:
            return html.fromstring(response.text)

        print(f"Rate limited, remorseful pause ({attempt}/{tries})")
        # Pausing after the final attempt would only delay the failure.
        if attempt < tries:
            time.sleep(retry_delay)
    raise SystemExit("Persistent rate limit :(")
|
|
|
|
|
def main() -> None:
    """Run the command-line interface.

    Reads the tag name from the command line, counts the freeform tags used
    on its works, prints those with at least 10 occurrences, and writes the
    complete tally to ``freeform-tags.csv`` in the current directory.
    """
    parser = argparse.ArgumentParser(description="Search AO3 for freeform tags")
    parser.add_argument(
        "tag",
        metavar="TAG",
        type=str,
        help="The tag we want to find associated freeform tags for",
    )
    args = parser.parse_args()

    print(f"Searching for works tagged {args.tag!r}")
    freeform_tags = single_tag_work_search(args.tag)

    threshold = 10
    filtered = Counter(
        {tag: count for tag, count in freeform_tags.items() if count >= threshold}
    )
    print(f"\nResults for tags with at least {threshold} occurrences:")
    print_top(filtered)

    print("\nExporting full result set to freeform-tags.csv")
    # newline="" lets the csv module control line endings (otherwise Windows
    # gets a blank row after every record); an explicit UTF-8 encoding keeps
    # non-ASCII tag names from failing under a locale-dependent default.
    with open("freeform-tags.csv", "w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(["tag", "occurrences"])
        writer.writerows(freeform_tags.most_common())
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script;
# importing it as a module must not start a search.
if __name__ == "__main__":
    main()