rvanbruggen/create_gped_db.cql

## create_gped_db.cql

//create indexes
create index on :Classification(name);
create index on :SubClassification(name);
create index on :FuelType(name);
create index on :Country(name);
create index on :Plant(name);
create index on :Plant(id);

//FROM FUELTYPE TAB

//add Top-level classification
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
with distinct csv.Classification as Classification
merge (c:Classification {name: Classification});

//add FuelTypes
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
merge (ft:FuelType {name: csv.IEA_Fuel_Type});

//add Sub-level Classification
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
match (c:Classification {name: csv.Classification})
merge (sc:SubClassification {name: csv.SubClassification, fullname: csv.Full_Name})-[:SUBCLASSIFICATION_OF]-(c);

//connect the FuelTypes
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
match (sc:SubClassification {name: csv.SubClassification}), (ft:FuelType {name: csv.IEA_Fuel_Type})
merge (sc)-[:HAS_FUEL_TYPE]->(ft);

//FROM PLANTEMISSIONS TAB

//countries
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
with distinct csv.Country as country
merge (c:Country {name: country});

//plants in countries
using periodic commit
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
match (c:Country {name: csv.Country})
create (c)<-[:LOCATED_IN]-(p:Plant {id: toInt(csv.ID), name: csv.Plant_Name, number_of_units: toInt(csv.Number_of_units), capacity: toFloat(csv.Capacity_in_MW), co2: toFloat(csv.CO2_in_Mg), so2: toFloat(csv.SO2_in_Mg), NOx: toFloat(csv.NOx_in_Mg), PM25: toFloat(csv.PM25_in_Mg)});

//plants with classification
using periodic commit
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
match (cl:Classification {name: csv.Classification})
match (p:Plant {id: toInt(csv.ID)})
create (p)-[:HAS_CLASSIFICATION]->(cl);

## gped_graphgist.adoc

      
    Raw
  

              gped_graphgist.adoc
            
          
    Global Power Emissions Database as a Graph


A spreadsheet as the source - again


Like everyone, I would like to understand more about Global Warming. In one of my late night browsing sessions, I came across an interesting - if somewhat outdated - dataset about worldwide power plants and their emissions. You can find it over here. It’s available as a spreadsheet, and I have converted it into a google sheet immediately over here. I cleaned up two tabs (fueltype, and plantemissions) a little bit, and then I could quickly start importing into Neo4j.


I came up with the following main entities:
* Classifications: of Power Plant into types
* Sub-classifications: of the same Plants into subtypes
* Fuel-Types: another kind of sub-classification of the power plants. Similar but different from the Sub-classification.
* Countries
* Plants - located in Countries


And that is what we will import the info into.


Setup the database - part 1


Let’s start by creating the indexes:


//create indexes
create index on :Classification(name);
create index on :SubClassification(name);
create index on :FuelType(name);
create index on :Country(name);
create index on :Plant(name);
create index on :Plant(id);


Let’s continue.


Setup the database - part 2


Then we can add the first entities from the FuelType tab of the spreadsheet:


//add Top-level classification
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
with distinct csv.Classification as Classification
merge (c:Classification {name: Classification});

//add FuelTypes
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
merge (ft:FuelType {name: csv.IEA_Fuel_Type});

//add Sub-level Classification
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
match (c:Classification {name: csv.Classification})
merge (sc:SubClassification {name: csv.SubClassification, fullname: csv.Full_Name})-[:SUBCLASSIFICATION_OF]-(c);

//connect the FuelTypes
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
match (sc:SubClassification {name: csv.SubClassification}), (ft:FuelType {name: csv.IEA_Fuel_Type})
merge (sc)-[:HAS_FUEL_TYPE]->(ft);


This is not too bad - this tab of of the spreadsheet is pretty small.


Now we can add the bigger tab of the spreadsheet - with the plant data.


Setup the database - part 3


This is where we add the plant emissions data.


//FROM PLANTEMISSIONS TAB

//countries
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as fullcsv
with fullcsv as csv
with distinct csv.Country as country
merge (c:Country {name: country});

//plants in countries
using periodic commit
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as fullcsv
with fullcsv as csv
limit 10
match (c:Country {name: csv.Country})
create (c)<-[:LOCATED_IN]-(p:Plant {id: toInt(csv.ID), name: csv.Plant_Name, number_of_units: toInt(csv.Number_of_units), capacity: toFloat(csv.Capacity_in_MW), co2: toFloat(csv.CO2_in_Mg), so2: toFloat(csv.SO2_in_Mg), NOx: toFloat(csv.NOx_in_Mg), PM25: toFloat(csv.PM25_in_Mg)});

//plants with classification
using periodic commit
load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as fullcsv
with fullcsv as csv
limit 10
match (cl:Classification {name: csv.Classification})
match (p:Plant {id: toInt(csv.ID)})
create (p)-[:HAS_CLASSIFICATION]->(cl);


This should give you a graph with much fewer nodes.


Querying this Global Power Emissions Database


I was quite intrigued by this database, because the spreadsheet has a number of aggregations (sums by country, for example) - and I was wondering how that would work in Neo4j. Surprisingly well, it seems - as I will show you below.


Summary of plants per country


Let’s see how many plants we can find per country:


match (c:Country)--(p:Plant)--(cl:Classification)
return c.name, cl.name, count(p)
order by c.name, cl.name asc;


Summary of emissions per country


Then let’s see if we can replicate the summary spreadsheet that you can also find in the original dataset, by aggregating all the individual plant data into per country summaries:


MATCH (p:Plant)--(n:Country)
return n.name, sum(p.co2), sum(p.so2), sum(p.NOx), sum(p.PM25)
order by n.name;


As you can see it is really easy and efficient. Works like a charm.


Sorting some plants and countries


Let’s order by CO2 emmissions:


//largest co2 polluting plants
MATCH (p:Plant)--(conn)
return p,conn
order by p.co2 desc
limit 5;


and look at the result:


Let’s order by NOx emmissions:


MATCH (p:Plant)--(conn)
return p,conn
order by p.NOx desc
limit 25;


and look at the table:


Let’s order by SO2 emmissions:


MATCH (p:Plant)--(conn)
return p,conn
order by p.so2 desc
limit 25;


and take a final peak at that:


Looking at some connections


Let’s take a peak at some interesting graphs. For example, what if we wanted to know the least and most efficient power plants in terms of CO2 emissions.


Least efficient Plants


Let’s see which plants produce the LEAST power (expressed in MegaWatt) per CO2 emission (in Mg).
Here’s our query:


match path = ((c:Classification)--(p:Plant)--(co:Country))
return path
order by p.co2/p.capacity desc
limit 25;


And look at the result:


Most efficient Plants


Then we look at the MOST efficient power plants, who produce the most power for the least (but not 0) CO2:


match path = ((c:Classification)--(p:Plant)--(co:Country))
where p.co2 > 0
return path
order by p.co2/p.capacity asc
limit 25;


And present the result:


Conclusion


Hope this was an interesting exercise. More info or questions always welcome.


Created by Rik Van Bruggen - Twitter | Blog | LinkedIn


## query_gped_db.cql
//queries

//summary of plants per country
match (c:Country)--(p:Plant)--(cl:Classification)
return c.name, cl.name, count(p)
order by c.name, cl.name asc;

//summary of emissions per country
MATCH (p:Plant)--(n:Country)
return n.name, sum(p.co2), sum(p.so2), sum(p.NOx), sum(p.PM25)
order by n.name;

//largest co2 polluting plants
MATCH (p:Plant)--(conn)
return p,conn
order by p.co2 desc
limit 25;

MATCH (p:Plant)--(conn)
return p,conn
order by p.NOx desc
limit 25;

MATCH (p:Plant)--(conn)
return p,conn
order by p.so2 desc
limit 25;


//least efficient plants
match path = ((c:Classification)--(p:Plant)--(co:Country))
return co.name, c.name, p.name, p.co2, p.capacity, p.co2/p.capacity
order by p.co2/p.capacity desc
limit 25;

match path = ((c:Classification)--(p:Plant)--(co:Country))
return path
order by p.co2/p.capacity desc
limit 25;

//most efficient but co2 > 0
match path = ((c:Classification)--(p:Plant)--(co:Country))
where p.co2 > 0
return co.name, c.name, p.name, p.co2, p.capacity, p.co2/p.capacity
order by p.co2/p.capacity asc
limit 25;

match path = ((c:Classification)--(p:Plant)--(co:Country))
where p.co2 > 0
return path
order by p.co2/p.capacity asc
limit 25;

	//create indexes
	create index on :Classification(name);
	create index on :SubClassification(name);
	create index on :FuelType(name);
	create index on :Country(name);
	create index on :Plant(name);
	create index on :Plant(id);

	//FROM FUELTYPE TAB

	//add Top-level classification
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
	with distinct csv.Classification as Classification
	merge (c:Classification {name: Classification});

	//add FuelTypes
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
	merge (ft:FuelType {name: csv.IEA_Fuel_Type});

	//add Sub-level Classification
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
	match (c:Classification {name: csv.Classification})
	merge (sc:SubClassification {name: csv.SubClassification, fullname: csv.Full_Name})-[:SUBCLASSIFICATION_OF]-(c);

	//connect the FuelTypes
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=1877461938" as csv
	match (sc:SubClassification {name: csv.SubClassification}), (ft:FuelType {name: csv.IEA_Fuel_Type})
	merge (sc)-[:HAS_FUEL_TYPE]->(ft);

	//FROM PLANTEMISSIONS TAB

	//countries
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
	with distinct csv.Country as country
	merge (c:Country {name: country});

	//plants in countries
	using periodic commit
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
	match (c:Country {name: csv.Country})
	create (c)<-[:LOCATED_IN]-(p:Plant {id: toInt(csv.ID), name: csv.Plant_Name, number_of_units: toInt(csv.Number_of_units), capacity: toFloat(csv.Capacity_in_MW), co2: toFloat(csv.CO2_in_Mg), so2: toFloat(csv.SO2_in_Mg), NOx: toFloat(csv.NOx_in_Mg), PM25: toFloat(csv.PM25_in_Mg)});

	//plants with classification
	using periodic commit
	load csv with headers from "https://docs.google.com/spreadsheets/d/1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk/export?format=csv&id=1iBfZ52mQ5FFq0Mg3Q3WICb9vfzL1aAVQNjhUDXPa3Sk&gid=321984711" as csv
	match (cl:Classification {name: csv.Classification})
	match (p:Plant {id: toInt(csv.ID)})
	create (p)-[:HAS_CLASSIFICATION]->(cl);
	//queries

	//summary of plants per country
	match (c:Country)--(p:Plant)--(cl:Classification)
	return c.name, cl.name, count(p)
	order by c.name, cl.name asc;

	//summary of emissions per country
	MATCH (p:Plant)--(n:Country)
	return n.name, sum(p.co2), sum(p.so2), sum(p.NOx), sum(p.PM25)
	order by n.name;

	//largest co2 polluting plants
	MATCH (p:Plant)--(conn)
	return p,conn
	order by p.co2 desc
	limit 25;

	MATCH (p:Plant)--(conn)
	return p,conn
	order by p.NOx desc
	limit 25;

	MATCH (p:Plant)--(conn)
	return p,conn
	order by p.so2 desc
	limit 25;


	//least efficient plants
	match path = ((c:Classification)--(p:Plant)--(co:Country))
	return co.name, c.name, p.name, p.co2, p.capacity, p.co2/p.capacity
	order by p.co2/p.capacity desc
	limit 25;

	match path = ((c:Classification)--(p:Plant)--(co:Country))
	return path
	order by p.co2/p.capacity desc
	limit 25;

	//most efficient but co2 > 0
	match path = ((c:Classification)--(p:Plant)--(co:Country))
	where p.co2 > 0
	return co.name, c.name, p.name, p.co2, p.capacity, p.co2/p.capacity
	order by p.co2/p.capacity asc
	limit 25;

	match path = ((c:Classification)--(p:Plant)--(co:Country))
	where p.co2 > 0
	return path
	order by p.co2/p.capacity asc
	limit 25;