# cazzerson/NC-HB2-ids.sh

## NC-HB2-ids.sh
# This script requires the jq utility
# https://stedolan.github.io/jq/
# Datasets created with twarc
# https://github.com/DocNow/twarc

# Collect Tweet IDs from the search result folders in the current directory,
# de-duplicate them, and split the result into 50,000-line files.
# Everything is written under $out_dir; the working directory never changes.
out_dir=NCHB2-ids
dupes="$out_dir/NCHB2-ids-with-dupes.txt"
ids="$out_dir/NCHB2-ids.txt"

mkdir -p "$out_dir" || exit 1
# -f keeps a first run quiet, when there is nothing to clean up yet.
rm -f "$out_dir"/NCHB2*
: > "$dupes" || exit 1

# Create more relevant subset of "North Carolina" search.
# IDs are read from id_str rather than id: jq may parse JSON numbers as
# doubles, which cannot represent 64-bit Tweet IDs exactly.
# NOTE(review): assumes every record carries the id_str field that the
# Twitter API supplies alongside id -- confirm against the datasets.
jq -c '{id_str, text}' North_Carolina/*.json \
  | grep -Ei 'text.*(hb2|bill|bathroom|KeepNCFair)' \
  | jq -r '.id_str' >> "$dupes"

# Get the IDs for all of the Tweets in the other searches.
# -exec ... {} + hands file names to jq without word splitting and never
# starts jq with an empty file list (which would make it wait on stdin).
find -L . -name '*.json' ! -path './North_Carolina/*' \
  -exec jq -r '.id_str' {} + >> "$dupes"

# Sort IDs and remove duplicates
sort -u "$dupes" > "$ids"

# Break into files with 50,000 Tweet IDs each
split -l 50000 "$ids" "$out_dir/NCHB2-ids."
	# This script requires the jq utility
	# https://stedolan.github.io/jq/
	# Datasets created with twarc
	# https://github.com/DocNow/twarc

	# Collect Tweet IDs from the search result folders in the current directory,
	# de-duplicate them, and split the result into 50,000-line files.
	# Everything is written under $out_dir; the working directory never changes.
	out_dir=NCHB2-ids
	dupes="$out_dir/NCHB2-ids-with-dupes.txt"
	ids="$out_dir/NCHB2-ids.txt"

	mkdir -p "$out_dir" || exit 1
	# -f keeps a first run quiet, when there is nothing to clean up yet.
	rm -f "$out_dir"/NCHB2*
	: > "$dupes" || exit 1

	# Create more relevant subset of "North Carolina" search.
	# Real pipes and the `*` wildcards are required here; escaped `\|` would be
	# passed to jq as literal arguments.
	# IDs are read from id_str rather than id: jq may parse JSON numbers as
	# doubles, which cannot represent 64-bit Tweet IDs exactly.
	# NOTE(review): assumes every record carries the id_str field that the
	# Twitter API supplies alongside id -- confirm against the datasets.
	jq -c '{id_str, text}' North_Carolina/*.json \
	  | grep -Ei 'text.*(hb2|bill|bathroom|KeepNCFair)' \
	  | jq -r '.id_str' >> "$dupes"

	# Get the IDs for all of the Tweets in the other searches.
	# -exec ... {} + hands file names to jq without word splitting and never
	# starts jq with an empty file list (which would make it wait on stdin).
	find -L . -name '*.json' ! -path './North_Carolina/*' \
	  -exec jq -r '.id_str' {} + >> "$dupes"

	# Sort IDs and remove duplicates
	sort -u "$dupes" > "$ids"

	# Break into files with 50,000 Tweet IDs each
	split -l 50000 "$ids" "$out_dir/NCHB2-ids."