Skip to content

Instantly share code, notes, and snippets.

@kylemcdonald
Last active June 21, 2023 21:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kylemcdonald/0f1dbd597cf4fd28e09344168bd05301 to your computer and use it in GitHub Desktop.
Save kylemcdonald/0f1dbd597cf4fd28e09344168bd05301 to your computer and use it in GitHub Desktop.
Scrape LA Times officer-involved homicides.
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
import requests
from itertools import count
from joblib import Parallel, delayed
# Collect the metadata of every officer-involved post by walking the paginated
# posts API (page numbers start at 1) until a page comes back empty.
metadata = []
for page in count(1):
    url = f'https://homicide.latimes.com/api/posts/page/{page}/?officer_involved=true&year=all'
    print(url)
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=30)
    if response.status_code == 404:
        # NOTE(review): some paginated APIs answer 404 instead of an empty
        # list once past the last page; treat that as the end as well.
        break
    # Any other HTTP error should stop the run instead of being parsed as data.
    response.raise_for_status()
    data = response.json()
    if not data:
        break
    metadata.extend(data)
print(len(metadata))
def job(e):
    """Fetch one post's page and merge the scraped details into its record.

    Args:
        e: A post record from the posts API. Only its ``'slug'`` key is read
            here; all of its keys are copied into the result.

    Returns:
        A new dict containing the keys of ``e``, one entry per
        ``label: value`` item found in the page's ``aspects`` list, and
        ``'full_death_date'`` holding the text of the ``death-date`` element
        (``None`` when the page has no such element).

    Raises:
        requests.HTTPError: If the post page responds with an error status.
    """
    slug = e['slug']
    url = f'https://homicide.latimes.com/post/{slug}/'
    # Bound the wait so one stalled request cannot hang a worker forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    death_date = soup.find(class_='death-date')
    aspects_list = soup.find(class_='aspects')
    # A page without an aspects list simply contributes no extra fields.
    items = aspects_list.find_all('li') if aspects_list is not None else []

    aspects = {}
    for item in items:
        # Split on the first colon only, so a value that itself contains a
        # colon is kept whole instead of the item being discarded.
        label, sep, value = item.text.partition(':')
        if sep:  # items with no colon at all are not label/value pairs
            aspects[label.strip()] = value.strip()

    return {
        **e,
        **aspects,
        'full_death_date': death_date.text if death_date is not None else None,
    }
# Scrape every post page in parallel, one job per metadata record.
tasks = (delayed(job)(task) for task in tqdm(metadata))
results = Parallel(n_jobs=-1)(tasks)

# Keep the complete nested records as JSON.
with open('latimes-killings.json', 'w') as f:
    json.dump(results, f, indent=2)

# For the CSV, lift the fields of the nested 'homicide' dict to the top level
# of each record and drop the nested dict itself.
filtered = []
for record in results:
    row = {**record, **record['homicide']}
    del row['homicide']
    filtered.append(row)

frame = pd.DataFrame(filtered)
frame.to_csv('latimes-killings.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment