Tomaz Bratanic tomasonjo

## Neo4j Graph Model Rule Book
Rules for labels
1. We use labels to group entities together -- (:Person),(:Company)
2. We use additional labels as a preprocessed way of filtering nodes faster -- (:Person:Expert),(:Company:VIP)
3. We can use labels for marking steps in our process -- (:Order:ExportedToElastic),(:Order:Error)
**************************************************************************************************************************************
Rules for date format
1. Neo4j does not support date format out of the box
2. Parse all dates to Unix epoch time in seconds or milliseconds
3. Pick seconds or milliseconds and be consistent
4. Use (days * hours * minutes * seconds) in calculations to keep the code clean and readable

## Paradise
// Import Paradise Papers officers from CSV, committing in batches of 2000 rows.
USING PERIODIC COMMIT 2000
LOAD CSV WITH HEADERS FROM "file:///paradise_papers.nodes.officer.csv" AS row
// MERGE on (:Node {id}) so an existing node with the same id is reused, not duplicated.
MERGE (n:Node {id: row.`n.node_id`})
SET n:Officer
SET n.service_provider = row.`n.service_provider`,
    n.jurisdiction_description = row.`n.jurisdiction_description`,
    n.status = row.`n.status`,
    n.country_codes = row.`n.country_codes`,
    n.type = row.`n.type`,
    // Fixed: note was previously copied from the `n.type` column.
    // Assumes the CSV has an `n.note` header like the other `n.*` columns -- TODO confirm.
    n.note = row.`n.note`
// NOTE(review): the original listing was cut off after this assignment (dangling comma);
// append any remaining property assignments to the SET list above.

## Paradise papers
Infer a network

// Connect pairs of officers that are officer_of the same node, keeping only
// officers whose registered address name contains "Zurich".
MATCH (addr1:Address)<-[:registered_address]-(first:Officer)-[:officer_of]->()<-[:officer_of]-(second:Officer)-[:registered_address]->(addr2:Address)
WHERE addr1.name CONTAINS "Zurich"
  AND addr2.name CONTAINS "Zurich"
  AND id(first) > id(second)  // keep only one orientation of each officer pair
// One row per matched path; the count per pair becomes the relationship weight.
WITH first, second, count(*) AS shared
MERGE (first)-[link:COMMON_INVESTMENTS]-(second)
ON CREATE SET link.weight = shared

--------------------------------------------------------------------------------------

## Medium_Survey_All_Correlations.csv

          
            key_1
             key_2
             pearson

            
              Medicine
              Biology
               0.6751690175278219

            
              Chemistry
              Biology
               0.6580361718554997

            
              Fantasy/Fairy tales
              Animated
               0.6508308637290211

            
              Shopping centres
              Shopping
               0.6443774884976909

            
              Medicine
              Chemistry
               0.6119966796637772

            
              Physics
              Mathematics
               0.5870842251467656

            
              Opera
              Classical music
               0.5809496903367943

            
              Snakes
              Rats
               0.5681984607930817

            
              Weight
              Gender_vec
               0.541795647440021

## survey_categories_properties.csv

          
            property
            unique values

            
              Gender
              ['female', 'male']

            
              Left - right handed
              ['left handed', 'right handed']

            
              Lying
              ['sometimes', 'everytime it suits me', 'only to avoid hurting someone', 'never']

            
              Alcohol
              ['social drinker', 'never', 'drink a lot']

            
              Education
              ['secondary school', 'primary school', 'college/bachelor degree', 'masters degree', 'doctorate degree', 'currently a primary school pupil']

            
              Smoking
              ['current smoker', 'tried smoking', 'never smoked', 'former smoker']

            
              House - block of flats
              ['block of flats', 'house/bungalow']

            
              Village - town
              ['city', 'village']

            
              Punctuality
              ['i am often early', 'i am often running late', 'i am always on time']

## Medium_Survey_Stdev.csv

          
            key
             average
             std

            
              Personality
               3.2922465208747522
               0.6434356809234291

            
              Music
               4.731876861966243
               0.6640489340478044

            
              Dreams
               3.2970297029702955
               0.683147766788056

            
              Movies
               4.613545816733062
               0.6946999014202662

            
              Fun with friends
               4.5576540755467185
               0.7371830636089882

            
              Comedy
               4.494538232373387
               0.7797894145803115

            
              Internet_vec
               3.838613861386136
               0.8213540389444351

            
              Happiness in life
               3.705765407554671
               0.8243233683199775

            
              Slow songs or fast songs
               3.3283730158730185
               0.8339307217064154

## Medium_Survey_Gender_correlation.csv

          
            key_1
             key_2
             pearson

            
              Gender_vec
              Weight
               0.541795647440021

            
              Gender_vec
              PC
               0.4595381175639033

            
              Gender_vec
              Cars
               0.43821572092706285

            
              Gender_vec
              Action
               0.4093180552569303

            
              Gender_vec
              War
               0.40744466090777826

            
              Gender_vec
              Science and technology
               0.3575550988826724

            
              Gender_vec
              Western
               0.3482424112983126

            
              Gender_vec
              Sci-fi
               0.3092600003234222

            
              Gender_vec
              Physics
               0.3051120080067347

## Medium_Survey_Results.csv
community,size,male_percentage,top_3,bottom_3
3,220,0.004545454545454545,"['Compassion to animals', 'Romantic', 'Borrowed stuff']","['Metal or Hardrock', 'Writing', 'Western']"
4,190,0.9315789473684211,"['Cheating in school', 'Action', 'PC']","['Storm', 'Gardening', 'Writing']"
6,175,0.7428571428571429,"['Keeping promises', 'Internet_vec', 'Borrowed stuff']","['Writing', 'Darkness', 'Storm']"
5,125,0.008,"['Fantasy/Fairy tales', 'Empathy', 'Foreign languages']","['Cars', 'Hypochondria', 'Metal or Hardrock']"
2,101,0.7425742574257426,"['Rock', 'Borrowed stuff', 'Keeping promises']","['Darkness', 'Celebrities', 'Storm']"
0,98,0.030612244897959183,"['Empathy', 'Compassion to animals', 'Judgment calls']","['Eating to survive', 'Gardening', 'Hypochondria']"
75,2,0.0,"['Reliability', 'Reading', 'Countryside, outdoors']","['Heights', 'Western', 'Storm']"
44,2,0.0,"['Reliability', 'Politics', 'Romantic']","['Getting up', 'Spending on gadgets', 'Western']"
43,1,0.0,"['Finding lost valuables', 'Active sport', 'Reading'

## gutenberg_blog_preprocess
# https://www.gutenberg.org/ebooks/95 The Prisoner of Zenda

# Fetch the data
import urllib.request

target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read, even if read/decode raises.
with urllib.request.urlopen(target_url) as data:
    # Decode the whole body as UTF-8 and trim surrounding whitespace.
    raw_data = data.read().decode('utf8').strip()

# Preprocess text into chapters
import re

## gutenberg_blog_spacy
# Work on the opening chapter only
c = chapters[0]
# Run the NLP pipeline over the chapter text
doc = nlp(c)
# Distinct texts of all entities labelled PERSON
involved = list({ent.text for ent in doc.ents if ent.label_ == 'PERSON'})
# Names of the involved persons get replaced in the text by an id;
# this dict stores the mapping
decode = {}
for i,x in enumerate(involved):
    # Get mapping
	Rules for labels
	1. We use labels to group entities together -- (:Person),(:Company)
	2. We use additional labels as a preprocessed way of filtering nodes faster -- (:Person:Expert),(:Company:VIP)
	3. We can use labels for marking steps in our process -- (:Order:ExportedToElastic),(:Order:Error)
	**************************************************************************************************************************************
	Rules for date format
	1. Neo4j does not support date format out of the box
	2. Parse all dates to Unix epoch time in seconds or milliseconds
	3. Pick seconds or milliseconds and be consistent
	4. Use (days * hours * minutes * seconds) in calculations to keep the code clean and readable
	// Import Paradise Papers officers from CSV, committing in batches of 2000 rows.
	USING PERIODIC COMMIT 2000
	LOAD CSV WITH HEADERS FROM "file:///paradise_papers.nodes.officer.csv" AS row
	// MERGE on (:Node {id}) so an existing node with the same id is reused, not duplicated.
	MERGE (n:Node {id: row.`n.node_id`})
	SET n:Officer
	SET n.service_provider = row.`n.service_provider`,
	    n.jurisdiction_description = row.`n.jurisdiction_description`,
	    n.status = row.`n.status`,
	    n.country_codes = row.`n.country_codes`,
	    n.type = row.`n.type`,
	    // Fixed: note was previously copied from the `n.type` column.
	    // Assumes the CSV has an `n.note` header like the other `n.*` columns -- TODO confirm.
	    n.note = row.`n.note`
	// NOTE(review): the original listing was cut off after this assignment (dangling comma);
	// append any remaining property assignments to the SET list above.
	Infer a network

	// Connect pairs of officers that are officer_of the same node, keeping only
	// officers whose registered address name contains "Zurich".
	MATCH (addr1:Address)<-[:registered_address]-(first:Officer)-[:officer_of]->()<-[:officer_of]-(second:Officer)-[:registered_address]->(addr2:Address)
	WHERE addr1.name CONTAINS "Zurich"
	  AND addr2.name CONTAINS "Zurich"
	  AND id(first) > id(second)  // keep only one orientation of each officer pair
	// One row per matched path; the count per pair becomes the relationship weight.
	WITH first, second, count(*) AS shared
	MERGE (first)-[link:COMMON_INVESTMENTS]-(second)
	ON CREATE SET link.weight = shared

	--------------------------------------------------------------------------------------
key_1	key_2	pearson
Medicine	Biology	0.6751690175278219
Chemistry	Biology	0.6580361718554997
Fantasy/Fairy tales	Animated	0.6508308637290211
Shopping centres	Shopping	0.6443774884976909
Medicine	Chemistry	0.6119966796637772
Physics	Mathematics	0.5870842251467656
Opera	Classical music	0.5809496903367943
Snakes	Rats	0.5681984607930817
Weight	Gender_vec	0.541795647440021
	property	unique values
	Gender	['female', 'male']
	Left - right handed	['left handed', 'right handed']
	Lying	['sometimes', 'everytime it suits me', 'only to avoid hurting someone', 'never']
	Alcohol	['social drinker', 'never', 'drink a lot']
	Education	['secondary school', 'primary school', 'college/bachelor degree', 'masters degree', 'doctorate degree', 'currently a primary school pupil']
	Smoking	['current smoker', 'tried smoking', 'never smoked', 'former smoker']
	House - block of flats	['block of flats', 'house/bungalow']
	Village - town	['city', 'village']
	Punctuality	['i am often early', 'i am often running late', 'i am always on time']
key	average	std
Personality	3.2922465208747522	0.6434356809234291
Music	4.731876861966243	0.6640489340478044
Dreams	3.2970297029702955	0.683147766788056
Movies	4.613545816733062	0.6946999014202662
Fun with friends	4.5576540755467185	0.7371830636089882
Comedy	4.494538232373387	0.7797894145803115
Internet_vec	3.838613861386136	0.8213540389444351
Happiness in life	3.705765407554671	0.8243233683199775
Slow songs or fast songs	3.3283730158730185	0.8339307217064154
	community,size,male_percentage,top_3,bottom_3
	3,220,0.004545454545454545,"['Compassion to animals', 'Romantic', 'Borrowed stuff']","['Metal or Hardrock', 'Writing', 'Western']"
	4,190,0.9315789473684211,"['Cheating in school', 'Action', 'PC']","['Storm', 'Gardening', 'Writing']"
	6,175,0.7428571428571429,"['Keeping promises', 'Internet_vec', 'Borrowed stuff']","['Writing', 'Darkness', 'Storm']"
	5,125,0.008,"['Fantasy/Fairy tales', 'Empathy', 'Foreign languages']","['Cars', 'Hypochondria', 'Metal or Hardrock']"
	2,101,0.7425742574257426,"['Rock', 'Borrowed stuff', 'Keeping promises']","['Darkness', 'Celebrities', 'Storm']"
	0,98,0.030612244897959183,"['Empathy', 'Compassion to animals', 'Judgment calls']","['Eating to survive', 'Gardening', 'Hypochondria']"
	75,2,0.0,"['Reliability', 'Reading', 'Countryside, outdoors']","['Heights', 'Western', 'Storm']"
	44,2,0.0,"['Reliability', 'Politics', 'Romantic']","['Getting up', 'Spending on gadgets', 'Western']"
	43,1,0.0,"['Finding lost valuables', 'Active sport', 'Reading'
	# https://www.gutenberg.org/ebooks/95 The Prisoner of Zenda

	# Fetch the data
	import urllib.request

	target_url = 'https://www.gutenberg.org/files/95/95-0.txt'
	# Use the response as a context manager so the HTTP connection is closed
	# as soon as the body has been read, even if read/decode raises.
	with urllib.request.urlopen(target_url) as data:
		# Decode the whole body as UTF-8 and trim surrounding whitespace.
		raw_data = data.read().decode('utf8').strip()

	# Preprocess text into chapters
	import re
	# Work on the opening chapter only
	c = chapters[0]
	# Run the NLP pipeline over the chapter text
	doc = nlp(c)
	# Distinct texts of all entities labelled PERSON
	involved = list({ent.text for ent in doc.ents if ent.label_ == 'PERSON'})
	# Names of the involved persons get replaced in the text by an id;
	# this dict stores the mapping
	decode = {}
	for i,x in enumerate(involved):
	# Get mapping