Skip to content

Instantly share code, notes, and snippets.

@edelooff
Created July 22, 2022 22:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edelooff/9fc9d3ed910a3b4737ae59d3897b0383 to your computer and use it in GitHub Desktop.
Save edelooff/9fc9d3ed910a3b4737ae59d3897b0383 to your computer and use it in GitHub Desktop.
AO3 freeform tag analysis given a search tag
"""Retrieve AO3 freeform tag popularity for works of a given tag.

Usage:

    $ python freeform-tags.py "Clark Kent"

Requirements:

* cssselect
* lxml
* requests
"""
import argparse
import csv
import time
from collections import Counter
from collections.abc import Iterator
from typing import Optional
from urllib.parse import urljoin

import requests
from lxml import html
def print_top(counter, top: Optional[int] = None) -> None:
    """Print the most common entries of *counter* as a ranked list.

    Args:
        counter: A ``collections.Counter`` (or any object that provides a
            compatible ``most_common`` method).
        top: Maximum number of entries to print. ``None`` prints every entry.

    The rank column is right-aligned to the width of the largest rank that is
    actually printed, so requesting more rows than the counter holds does not
    add superfluous padding.
    """
    ranked = counter.most_common(top)
    # Size the rank column from the rows we really have, not from the
    # requested limit, which may be far larger than the counter itself.
    width = len(str(len(ranked)))
    for rank, (value, occurrences) in enumerate(ranked, 1):
        print(f"{rank:{width}d}: {value!r} with {occurrences} occurrences")
def tag_search_substitution(tag: str) -> str:
    """Return *tag* with URL-hostile characters replaced by AO3 placeholders.

    AO3 tag URLs spell a handful of characters as ``*x*`` placeholders:
    ``/`` -> ``*s*``, ``&`` -> ``*a*``, ``.`` -> ``*d*``, ``?`` -> ``*q*`` and
    ``#`` -> ``*h*``. All other characters are returned unchanged.

    Args:
        tag: The tag name as displayed on the site.

    Returns:
        The tag name in the form used inside ``/tags/<name>/works`` URLs.
    """
    substitutions = (
        ("/", "*s*"),
        ("&", "*a*"),
        (".", "*d*"),
        ("?", "*q*"),
        ("#", "*h*"),
    )
    # No placeholder contains a character that is itself substituted, so the
    # order in which the replacements are applied does not matter.
    for char, placeholder in substitutions:
        tag = tag.replace(char, placeholder)
    return tag
def single_tag_work_search(tag: str) -> Counter[str]:
    """Count the freeform tags on every work listed under *tag*.

    Walks all result pages of the tag's works listing, tallies the text of
    each freeform tag element found, and prints the current top five after
    every page as a progress indicator.

    Args:
        tag: The tag name whose works listing is scanned.

    Returns:
        A counter mapping freeform tag text to its number of occurrences.
    """
    search_safe = tag_search_substitution(tag)
    works_url = f"https://archiveofourown.org/tags/{search_safe}/works"
    tally: Counter[str] = Counter()
    pagenum = 0
    for page in search_paginator(works_url):
        pagenum += 1
        for elem in page.cssselect(".work .tags .freeforms .tag"):
            tally[elem.text] += 1
        print(f"\nIntermediate result after fetching page {pagenum}")
        print_top(tally, 5)
    return tally
def search_paginator(search_url: str) -> Iterator[html.HtmlElement]:
    """Yield every parsed result page of a listing, following "next" links.

    Starting at *search_url*, each page is fetched and yielded; the link inside
    ``.pagination .next`` is then resolved against the current URL to find the
    following page. Iteration ends when a page has no such link.

    Args:
        search_url: URL of the first result page.

    Yields:
        The parsed HTML tree of each result page, in order.
    """
    while True:
        page = fetch_page(search_url)
        yield page
        next_links = page.cssselect(".pagination .next a")
        href = next_links[0].get("href") if next_links else None
        if not href:
            # No next link, or one without a target. urljoin() would hand back
            # the current URL for an empty href and we would loop forever.
            break  # Exhausted the search!
        search_url = urljoin(search_url, href)
def fetch_page(
    url: str,
    tries: int = 30,
    *,
    retry_delay: float = 10.0,
    timeout: float = 60.0,
) -> html.HtmlElement:
    """Download *url* and return it as a parsed HTML tree.

    A response is treated as rate limited when its status is HTTP 429 or its
    body starts with "Retry later". In that case the request is repeated after
    a pause, up to *tries* attempts in total.

    Args:
        url: Address of the page to download.
        tries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait between rate-limited attempts.
        timeout: Seconds to wait for the server before ``requests`` raises
            its timeout exception (which is not caught here).

    Returns:
        The parsed document.

    Raises:
        SystemExit: If no attempt produced a non-rate-limited response.
    """
    for attempt in range(1, tries + 1):
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=timeout)
        rate_limited = response.status_code == 429 or response.text.startswith(
            "Retry later"
        )
        if not rate_limited:
            return html.fromstring(response.text)
        print(f"Rate limited, remorseful pause ({attempt}/{tries})")
        if attempt < tries:
            # Pausing after the final attempt would only delay the exit.
            time.sleep(retry_delay)
    raise SystemExit("Persistent rate limit :(")
def main() -> None:
    """Command-line entry point: report freeform tag popularity for one tag.

    Reads the TAG positional argument, counts the freeform tags on all works
    listed under it, prints the tags seen at least ten times, and writes the
    complete tally to ``freeform-tags.csv`` in the current working directory.
    """
    parser = argparse.ArgumentParser(description="Search AO3 for freeform tags")
    parser.add_argument(
        "tag",
        metavar="TAG",
        type=str,
        help="The tag we want to find associated freeform tags for",
    )
    args = parser.parse_args()

    print(f"Searching for works tagged {args.tag!r}")
    freeform_tags = single_tag_work_search(args.tag)

    filtered = Counter(
        {tag: count for tag, count in freeform_tags.items() if count >= 10}
    )
    print("\nResults for tags with at least 10 occurrences:")
    print_top(filtered)

    print("\nExporting full result set to freeform-tags.csv")
    # newline="" lets the csv module control line endings (avoids blank rows
    # on Windows); an explicit UTF-8 encoding keeps non-ASCII tag names from
    # failing or being mangled under a platform-specific default encoding.
    with open("freeform-tags.csv", "w", newline="", encoding="utf-8") as fp:
        writer = csv.writer(fp)
        writer.writerow(["tag", "occurrences"])
        writer.writerows(freeform_tags.most_common())
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment