Created
June 25, 2015 14:30
-
-
Save jexp/280ed1a37368e24357c5 to your computer and use it in GitHub Desktop.
Import script for Neo4j for the Stanford SNAP Pokec social network dataset, using Neo4j's parallel importer (takes 30s for 1.6M nodes, 30M rels)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import the Stanford SNAP "Pokec" social network dataset into a fresh Neo4j
# store using neo4j-import, then add a uniqueness constraint on the profile key.
# Dataset description: https://snap.stanford.edu/data/soc-pokec-readme.txt
#
# Arguments:
#   $1 - target store directory (default: pokec.db); DELETED and recreated
#   $2 - Neo4j version, used to locate the install under /data/versions
#        (default: 2.2.2)
#
# Requires bash (arrays, [[ ]], pipefail), curl, gzip, sed and a sort that
# supports --parallel and -S.

# Abort on the first failing command or pipeline stage so that a broken
# download or sort never leads to wiping the store and importing bad data.
set -euo pipefail

echo "Usage [pokec.db] [2.2.2|2.3-SNAPSHOT]"
DB=${1-pokec.db}
VERSION=${2-2.2.2}

SNAP_BASE_URL=https://snap.stanford.edu/data
PROFILES_RAW=soc-pokec-profiles.txt.gz
RELATIONSHIPS_RAW=soc-pokec-relationships.txt.gz
PROFILES_SORTED=soc-pokec-profiles_no_null_sorted.txt.gz

# Download $1 from the SNAP data directory unless it already exists locally.
# The download goes to "<name>.partial" and is renamed only on success, so an
# interrupted transfer is not mistaken for a complete file on the next run.
# -f makes curl fail on HTTP errors instead of saving the error page.
fetch_if_missing() {
  local file=$1
  if [[ ! -f "$file" ]]; then
    curl -fL -o "${file}.partial" "${SNAP_BASE_URL}/${file}"
    mv -- "${file}.partial" "$file"
  fi
}

# Print all arguments on one line, separated by single TAB characters.
join_tabs() {
  local IFS=$'\t'
  printf '%s\n' "$*"
}

fetch_if_missing "$PROFILES_RAW"
fetch_if_missing "$RELATIONSHIPS_RAW"

# Strip every literal "null" from the profiles (this also hits "null" inside
# longer words) and sort numerically on the leading key. Written via a
# ".partial" file for the same reason as the downloads.
if [[ ! -f "$PROFILES_SORTED" ]]; then
  gzip -dc "$PROFILES_RAW" \
    | sed -e 's/null//g' \
    | sort -k1 -n --parallel=10 -S 5G \
    | gzip > "${PROFILES_SORTED}.partial"
  mv -- "${PROFILES_SORTED}.partial" "$PROFILES_SORTED"
fi

# Header files. The importer below runs with --delimiter TAB, so the column
# names must be separated by real TABs; separated by spaces the whole header
# would be read as a single column.
profile_columns=(
  _key:ID public:INT completion_percentage gender:INT region last_login registration AGE:INT
  body I_am_working_in_field spoken_languages hobbies I_most_enjoy_good_food pets
  body_type my_eyesight eye_color hair_color hair_type completed_level_of_education
  favourite_color relation_to_smoking relation_to_alcohol sign_in_zodiac
  on_pokec_i_am_looking_for love_is_for_me relation_to_casual_sex my_partner_should_be
  marital_status children relation_to_children I_like_movies I_like_watching_movie
  I_like_music I_mostly_like_listening_to_music the_idea_of_good_evening
  I_like_specialties_from_kitchen fun I_am_going_to_concerts my_active_sports
  my_passive_sports profession I_like_books life_style music cars politics relationships
  art_culture hobbies_interests science_technologies computers_internet education sport
  movies travelling health companies_brands more
)
join_tabs "${profile_columns[@]}" > profiles_header.txt
join_tabs ':START_ID' ':END_ID' > relationships_header.txt

#export NEO=~/Downloads/neo4j-enterprise-2.3-SNAPSHOT
#export NEO=/data/versions/neo4j-enterprise-2.3-SNAPSHOT
export NEO=/data/versions/neo4j-enterprise-$VERSION
export IDTYPE=integer #actual

# Start from an empty store. ${DB:?} aborts instead of running rm with an
# empty path if the first argument was passed as "".
rm -rf -- "${DB:?}"

"$NEO/bin/neo4j-import" --into "$DB" --id-type "$IDTYPE" --delimiter TAB --quote Ö \
  --nodes:PROFILES "profiles_header.txt,${PROFILES_SORTED}" \
  --relationships:RELATION "relationships_header.txt,${RELATIONSHIPS_RAW}"

# Create the uniqueness constraint on the imported key via the offline shell.
JAVA_OPTS="-Xmx12G -Xmn2G" "$NEO/bin/neo4j-shell" -path "$DB" \
  -c 'CREATE CONSTRAINT ON (p:PROFILES) ASSERT p._key IS UNIQUE;'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nodes
Error in input data
Caused by:Extra column not present in header on line 0 in /home/patricia/Descargas/280ed1a37368e24357c5-a8c4a1e2f313ce8f4991c77a65a9a4b2741dcd81/soc-pokec-profiles_no_null_sorted.txt.gz with value null
Could you help me?