# Extracting HB2 Tweet IDs from multiple twarc datasets
# This script requires the jq utility
# https://stedolan.github.io/jq/
# Datasets created with twarc
# https://github.com/DocNow/twarc
#
# Run from the directory that holds the twarc search folders. Every run
# rebuilds NCHB2-ids/ from scratch:
#   NCHB2-ids-with-dupes.txt  raw IDs, one per line, duplicates included
#   NCHB2-ids.txt             sorted, de-duplicated IDs
#   NCHB2-ids.aa, .ab, ...    NCHB2-ids.txt in chunks of 50,000 lines
#
# NOTE(review): jq releases before 1.7 are documented to read numbers as
# doubles, so 64-bit Tweet IDs emitted via '.id' may come out rounded. If the
# dataset carries an "id_str" field, `jq -r '.id_str'` would avoid that --
# confirm against the data before changing.

out_dir=NCHB2-ids
dupes_file=$out_dir/NCHB2-ids-with-dupes.txt
ids_file=$out_dir/NCHB2-ids.txt

# Fail early with a clear message instead of silently producing empty output.
command -v jq >/dev/null 2>&1 || {
  printf 'error: jq is required but was not found on PATH\n' >&2
  exit 1
}

mkdir -p "$out_dir" || exit 1
# -f: stay quiet on the first run, when there is nothing to remove yet.
rm -f -- "$out_dir"/NCHB2*
# Start from an empty file so the two appends below never see stale IDs.
: > "$dupes_file" || exit 1

# Create more relevant subset of "North Carolina" search
jq -c '{id, text}' North_Carolina/*.json \
  | grep -Eih 'text.*(hb2|bill|bathroom|KeepNCFair)' \
  | jq '.id' >> "$dupes_file"

# Get the IDs for all of the Tweets in the other searches.
# find -exec passes the paths to jq directly, so names containing whitespace
# survive intact, and jq is not started at all when nothing matches (a bare
# `jq '.id'` with no file arguments would sit waiting on stdin).
find -L . -type f -name '*.json' -not -path './North_Carolina/*' \
  -exec jq '.id' {} + >> "$dupes_file"

# Sort IDs and remove duplicates
sort -u "$dupes_file" > "$ids_file" || exit 1

# Break into files with 50,000 Tweet IDs each
split -l 50000 "$ids_file" "$out_dir/NCHB2-ids."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment