Brandon Locke brandontlocke

## batchner-collapser.py
# script created by Devin Higgins, adapted by Brandon Locke
import csv
import sys

if len(sys.argv) != 2:
    raise ValueError('Please provide a batchner file')

batchner = sys.argv[1]

with open(batchner, "rU") as csvfile:

## youtube-dl scripts
https://gitlab.com/TheFrenchGhosty/TheFrenchGhostys-YouTube-DL-Archivist-Scripts

## inaugural-batchner
doc,entity,entityType,count
10_adams_john_quincy_1825,Army,organization,1
10_adams_john_quincy_1825,Congress,organization,1
10_adams_john_quincy_1825,Executive Magistrate,organization,1
10_adams_john_quincy_1825,General Government of the Union,organization,1
10_adams_john_quincy_1825,Legislature,organization,2
10_adams_john_quincy_1825,Navy,organization,1
10_adams_john_quincy_1825,State,organization,3
10_adams_john_quincy_1825,John Quincy Adams,person,1
10_adams_john_quincy_1825,Europe,location,3

## saltheader.sh
# Pass 1: copy every .txt file into salt/, inserting a running header line
# after every 25th line of the file.
mkdir -p salt

for file in *.txt
	do
		# With no matching files the glob stays literal; skip it.
		[ -e "$file" ] || continue
		awk '{print; if (FNR % 25 == 0 ) printf "NORTH CAROLINA WRITERS PROJECT         2\n";}' "$file" > "salt/$file"
	done

# Pass 2: prepend the first-page header line to every salted file.
for file in salt/*.txt
	do
		[ -e "$file" ] || continue
		# printf always emits a real newline after the header; bash's builtin
		# echo prints "\n" literally unless -e/xpg_echo is set. The result is
		# built in a temp file so the input is never read and truncated by
		# the same command.
		{ printf 'NORTH CAROLINA WRITERS PROJECT      1\n'; cat "$file"; } > "$file.tmp" && mv "$file.tmp" "$file"
	done

## filerename
# Rename <orig>.txt to <new>.txt for every "orig,new[,...]" row of test.csv.
# First make sure test.csv ends with a newline, otherwise `read` fails on the
# final row and it is never renamed. This step finishes before the loop
# starts; the original piped an in-place sed into cat, so the file could be
# read while it was still being rewritten.
[ -z "$(tail -c 1 test.csv)" ] || printf '\n' >> test.csv
while IFS=, read -r orig new trash; do mv -- "$orig.txt" "$new.txt"; done < test.csv

## dlhref.sh
# For each line of list.txt (an HTML anchor that links to a PDF), download
# the PDF and name it after the link text with everything except
# alphanumerics and "-" removed.
while IFS= read -r line
do
   # Quote every expansion so URLs and link text are never word-split or
   # glob-expanded by the shell.
   pdflink=$(printf '%s\n' "$line" | grep -o 'https://.*pdf')
   # A line without a PDF link would call curl with no URL; skip it (and the
   # politeness delay) instead.
   [ -n "$pdflink" ] || continue
   name=$(printf '%s\n' "$line" | grep -o '\">.*</a>' | sed -e 's/\">//' -e 's/<\/a>//' -e "s/[^[:alnum:]-]//g")
   fname=$name".pdf"
   curl "$pdflink" -o "$fname"
   # Pause between downloads; a bare number of seconds is the POSIX form.
   sleep 15
done < list.txt

## flh-split-into-series-and-create-network.py
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite

# Load the full named-entity table straight from the project's GitHub repo.
fullset=pd.read_csv('https://raw.githubusercontent.com/FannieLouHamerPapers/NamedEntities/master/flh_ner_all.csv', low_memory=False)

# Split the table into one DataFrame per series by matching on the
# doc_title_full column. na=False treats rows with a missing title as
# non-matches; without it the mask contains NaN and .loc refuses to index
# with it.
# NOTE(review): the DOC pattern stops at '(D' -- presumably deliberate to
# tolerate title variants; confirm.
Delta_Opportunities_Corporation_DOC_Series=fullset.loc[fullset.doc_title_full.str.contains('Delta Opportunities Corporation (D', regex=False, na=False)]
Freedom_Farms_Corporation_FFC_Series=fullset.loc[fullset.doc_title_full.str.contains('Freedom Farms Corporation (FFC) Series', regex=False, na=False)]
# Series I needs a regex with an explicit delimiter after the "I" so it does
# not also match "Other Organization Series II".
Other_Organization_Series_I=fullset.loc[fullset.doc_title_full.str.contains('Other Organization Series I:|Other Organization Series I,|Other Organization Series I ', regex=True, na=False)]
Other_Organization_Series_II=fullset.loc[fullset.doc_title_full.str.contains('Other Organization Series II:', regex=False, na=False)]

## batchner to network workflow
1. run batchner on a directory
2. (OPTIONAL) clean in openrefine
3. (REQUIRED ONLY IF STEP 2 WAS COMPLETED) run the batchner-collapser script to merge duplicates
4. (OPTIONAL) run metadata merge
5. (OPTIONAL) run script NER-derivatives to generate derivatives by entityType
6. batchner to network

## batchner to network workflow
1. run batchner on a directory
2. (OPTIONAL) clean in openrefine
3. (REQUIRED ONLY IF STEP 2 WAS COMPLETED) run the batchner-collapser script to merge duplicates
4. (OPTIONAL) run metadata merge
5. (OPTIONAL) run script [NOT YET CREATED] to generate derivatives by entityType
6. batchner to network

NEEDED
a script that creates node lists from the network and looks up metadata for each node

## flh-metadatamerge.py
import pandas as pd

# Load the named-entity table (remote CSV) and the document metadata (local CSV).
entities = pd.read_csv(
    'https://raw.githubusercontent.com/FannieLouHamerPapers/NamedEntities/master/flh_ner_all.csv'
)
metadata = pd.read_csv('flhmetadata.csv')

# Keep only the first 16 characters of each doc name.
# NOTE(review): meant to drop the '.txt' extension; this presumes every name
# is exactly 16 characters long before the extension -- confirm.
entities.doc = entities.doc.str.slice(0, 16)

#join dataframes; select only some
	# script created by Devin Higgins, adapted by Brandon Locke
	import csv
	import sys

	if len(sys.argv) != 2:
	raise ValueError('Please provide a batchner file')

	batchner = sys.argv[1]

	with open(batchner, "rU") as csvfile:
	doc,entity,entityType,count
	10_adams_john_quincy_1825,Army,organization,1
	10_adams_john_quincy_1825,Congress,organization,1
	10_adams_john_quincy_1825,Executive Magistrate,organization,1
	10_adams_john_quincy_1825,General Government of the Union,organization,1
	10_adams_john_quincy_1825,Legislature,organization,2
	10_adams_john_quincy_1825,Navy,organization,1
	10_adams_john_quincy_1825,State,organization,3
	10_adams_john_quincy_1825,John Quincy Adams,person,1
	10_adams_john_quincy_1825,Europe,location,3
	for file in *.txt
	do
	cat $file \| awk '{print; if (FNR % 25 == 0 ) printf "NORTH CAROLINA WRITERS PROJECT 2\n";}' > salt/$file
	done

	for file in salt/*.txt
	do
	echo "NORTH CAROLINA WRITERS PROJECT 1\n$(cat $file)" > $file
	done
	cat list.txt \| while read line
	do
	pdflink=$(echo $line \| grep -o 'https://.*pdf')
	name=$(echo $line \| grep -o '\">.*</a>' \| sed 's/\">//' \| sed 's/<\/a>//' \| sed "s/[^[:alnum:]-]//g")
	fname=$name".pdf"
	curl $pdflink -o $fname
	sleep 15s
	done
	import pandas as pd
	import networkx as nx
	from networkx.algorithms import bipartite

	fullset=pd.read_csv('https://raw.githubusercontent.com/FannieLouHamerPapers/NamedEntities/master/flh_ner_all.csv', low_memory=False)

	Delta_Opportunities_Corporation_DOC_Series=fullset.loc[fullset.doc_title_full.str.contains('Delta Opportunities Corporation (D', regex=False)]
	Freedom_Farms_Corporation_FFC_Series=fullset.loc[fullset.doc_title_full.str.contains('Freedom Farms Corporation (FFC) Series', regex=False)]
	Other_Organization_Series_I=fullset.loc[fullset.doc_title_full.str.contains('Other Organization Series I:\|Other Organization Series I,\|Other Organization Series I ', regex=True)]
	Other_Organization_Series_II=fullset.loc[fullset.doc_title_full.str.contains('Other Organization Series II:', regex=False)]
	1. run batchner on a directory
	2. (OPTIONAL) clean in openrefine
	3. (REQUIRED ONLY IF 2 COMPLETED) run batchner-collapse script to merge duplicates
	4. (OPTIONAL) run metadata merge
	5. (OPTIONAL) run script NER-derivatives to generate derivatives by entityType
	6. batchner to network
	import pandas as pd

	#read in data
	entities = pd.read_csv('https://raw.githubusercontent.com/FannieLouHamerPapers/NamedEntities/master/flh_ner_all.csv')
	metadata = pd.read_csv('flhmetadata.csv')

	#cut '.txt' from the doc names
	entities.doc = entities.doc.str[:16]

	#join dataframes; select only some