knbknb/clean-tweets.sh

## clean-tweets.sh
#!/bin/sh
# knb 2019-06 -- untested
#
# Clean tweet .json files taken from the Twitter streaming API.
# (Can probably also remove non-tweet objects.)
# The input must hold exactly one JSON object per line.
#
# Some tweets might have been corrupted by errors made by the application
# or by the operating system, but every tweet must be well-formed in order
# to be read in quickly by R or some other postprocessing app.
#
# Files:
#   infile_orig  raw capture, one tweet per line, most of them well-formed
#   infile       deduplicated copy of infile_orig
#   outfile      scratch file first, then the final compact result
infile_orig=tweets_file.json
infile=some_file.json
outfile=some_other_file.json

# Remove duplicate tweets.
# uniq only collapses *adjacent* repeated lines, so awk is used instead: it
# keeps the first occurrence of every line and preserves the input order.
# (Every distinct line is held in memory while awk runs.)
# The result goes through $outfile first so that $infile is only replaced
# once the deduplication has succeeded.
awk '!seen[$0]++' < "$infile_orig" > "$outfile" || exit 1
mv -- "$outfile" "$infile" || exit 1

# Use jq to pretty-print the well-formed tweets.
# Lines that do not parse are passed through unchanged, so they stay in
# place as single long lines.
jq -R -r '. as $line | try fromjson catch $line' < "$infile" > "$outfile" || exit 1

# Find the lines that were not pretty-printed and print each one prefixed
# with its line number in the pretty-printed file
# (remove them manually if possible).
# They typically start with {" whereas a pretty-printed object has its
# opening { alone on a line.
perl -ne '/^\{"/ && print qq($. $_)' < "$outfile"

# Use jq again to check whether parsing errors remain in the deduplicated
# file; jq reports the first error it hits on stderr.
if ! jq . < "$infile" > /dev/null; then
  printf '%s\n' "warning: $infile still contains malformed lines; they are skipped in $outfile" >&2
fi

# Compact the tweets back to one-tweet-per-row format.
# Each line is parsed on its own so that a malformed line is skipped
# instead of aborting the whole run at the first parse error.
jq -R -c 'fromjson?' < "$infile" > "$outfile"
	#!/bin/sh
	# knb 2019-06 -- untested
	#
	# Clean tweet .json files taken from the Twitter streaming API.
	# (Can probably also remove non-tweet objects.)
	# The input must hold exactly one JSON object per line.
	#
	# Some tweets might have been corrupted by errors made by the application
	# or by the operating system, but every tweet must be well-formed in order
	# to be read in quickly by R or some other postprocessing app.
	#
	# Files:
	#   infile_orig  raw capture, one tweet per line, most of them well-formed
	#   infile       deduplicated copy of infile_orig
	#   outfile      scratch file first, then the final compact result
	infile_orig=tweets_file.json
	infile=some_file.json
	outfile=some_other_file.json

	# Remove duplicate tweets.
	# uniq only collapses *adjacent* repeated lines, so awk is used instead: it
	# keeps the first occurrence of every line and preserves the input order.
	# (Every distinct line is held in memory while awk runs.)
	# The result goes through $outfile first so that $infile is only replaced
	# once the deduplication has succeeded.
	awk '!seen[$0]++' < "$infile_orig" > "$outfile" || exit 1
	mv -- "$outfile" "$infile" || exit 1

	# Use jq to pretty-print the well-formed tweets.
	# Lines that do not parse are passed through unchanged, so they stay in
	# place as single long lines.
	jq -R -r '. as $line | try fromjson catch $line' < "$infile" > "$outfile" || exit 1

	# Find the lines that were not pretty-printed and print each one prefixed
	# with its line number in the pretty-printed file
	# (remove them manually if possible).
	# They typically start with {" whereas a pretty-printed object has its
	# opening { alone on a line.
	perl -ne '/^\{"/ && print qq($. $_)' < "$outfile"

	# Use jq again to check whether parsing errors remain in the deduplicated
	# file; jq reports the first error it hits on stderr.
	if ! jq . < "$infile" > /dev/null; then
		printf '%s\n' "warning: $infile still contains malformed lines; they are skipped in $outfile" >&2
	fi

	# Compact the tweets back to one-tweet-per-row format.
	# Each line is parsed on its own so that a malformed line is skipped
	# instead of aborting the whole run at the first parse error.
	jq -R -c 'fromjson?' < "$infile" > "$outfile"