-
-
Save miriamfs/e1738c7e17ce4a479dbe to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the SIOC Twitter parser project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>siocTwitterParser</groupId>
    <artifactId>siocTwitterParser</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Pin the source encoding so the build does not depend on the platform charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- Apache Jena: provides the com.hp.hpl.jena.* model classes imported by the parser. -->
        <dependency>
            <groupId>org.apache.jena</groupId>
            <artifactId>apache-jena-libs</artifactId>
            <type>pom</type>
            <version>2.11.1</version>
        </dependency>
        <!-- json-lib: provides the net.sf.json.* classes imported by the parser. -->
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.4</version>
            <classifier>jdk15</classifier>
        </dependency>
    </dependencies>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package uk.ac.open.kmi.data.parser; | |
import com.hp.hpl.jena.ontology.OntModel; | |
import com.hp.hpl.jena.ontology.OntModelSpec; | |
import com.hp.hpl.jena.rdf.model.Model; | |
import com.hp.hpl.jena.rdf.model.ModelFactory; | |
import com.hp.hpl.jena.rdf.model.Resource; | |
import com.hp.hpl.jena.vocabulary.RDF; | |
import java.io.*; | |
import java.text.SimpleDateFormat; | |
import java.util.ArrayList; | |
import java.util.Properties; | |
import net.sf.json.JSONArray; | |
import net.sf.json.JSONObject; | |
import net.sf.json.JSONSerializer; | |
public class SIOCTwitterParser { | |
private static final String BASE_URL = "https://twitter.com/sioc/"; | |
private static final String SIOC_NS = "http://rdfs.org/sioc/ns#"; | |
private static final String TYPES_NS = "http://rdfs.org/sioc/types#"; | |
private static final String DCTERMS_NS = "http://purl.org/dc/terms/"; | |
private OntModel siocModel; | |
private Model model; | |
private String siocOntologyFolder; | |
private String jsonInputFile; | |
private String rdfOutputFile; | |
// ----------------------- | |
// Creating the ontology models | |
// ----------------------- | |
public SIOCTwitterParser(String propertiesFile) { | |
readProperties(propertiesFile); | |
//Create the model and set up the namespaces | |
this.model = ModelFactory.createDefaultModel(); | |
this.model.setNsPrefix("sioc", SIOC_NS); | |
this.model.setNsPrefix("dcterms", DCTERMS_NS); | |
this.model.setNsPrefix("types", TYPES_NS); | |
//Load the SIOC ontology | |
System.out.println("Loading the SIOC ontology"); | |
this.siocModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM); | |
loadModel(this.siocModel, new File(this.siocOntologyFolder), "", "RDF/XML"); | |
} | |
// ----------------------- | |
// Reading input/output for the program | |
// ----------------------- | |
public void readProperties(String propertiesFile) { | |
try { | |
Properties properties = new Properties(); | |
properties.load(new FileInputStream(propertiesFile)); | |
this.siocOntologyFolder = properties.getProperty("siocOntologyFolder"); | |
this.jsonInputFile = properties.getProperty("jsonInputFile"); | |
this.rdfOutputFile = properties.getProperty("rdfOutputFile"); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
private void loadModel(OntModel ontModel, File repositoryPath, String base, String format) { | |
try { | |
//Read the ontology from the corresponding files | |
if (repositoryPath.isDirectory()) { | |
for (File file : repositoryPath.listFiles()) { | |
try { | |
ontModel.read(new FileInputStream(file), base, format); | |
} catch (FileNotFoundException e) { | |
System.out.println("Error: The file " + file.getName() + " can not be added to the model "); | |
} | |
} | |
} else { | |
try { | |
System.out.println("FILE: " + repositoryPath.getAbsolutePath()); | |
ontModel.read(new FileInputStream(repositoryPath), base, format); | |
} catch (FileNotFoundException e) { | |
System.out.println("Error: The file " + repositoryPath + " can not be added to the model "); | |
} | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
// ----------------------- | |
// Writing the ontology model | |
// ----------------------- | |
public void printOntologyModel() { | |
try { | |
FileWriter out = new FileWriter(this.rdfOutputFile); | |
model.write(out, "RDF/XML-ABBREV"); | |
out.close(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
// ----------------------- | |
// Load and parse the Twitter JSON | |
// ----------------------- | |
public String readJSON() { | |
System.out.println("Obtaining Twitter Data"); | |
StringBuffer json = new StringBuffer(); | |
try { | |
BufferedReader reader = new BufferedReader(new FileReader(this.jsonInputFile)); | |
String line = reader.readLine(); | |
while (line != null) { | |
json.append(line); | |
line = reader.readLine(); | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
return json.toString(); | |
} | |
public void parsePostList(String json) { | |
System.out.println("Transforming Twitter data into SIOC "); | |
try { | |
JSONObject jsonObject = (JSONObject) JSONSerializer.toJSON(json); | |
JSONArray jsonPostsList = jsonObject.getJSONArray("statuses"); | |
for (int i = 0; i < jsonPostsList.size(); i++) { | |
parsePost(jsonPostsList.getJSONObject(i)); | |
} | |
} catch (Exception e) { | |
System.out.println("no posts " + json); | |
e.printStackTrace(); | |
} | |
} | |
public void parsePost(JSONObject post) { | |
try { | |
//Create the post object | |
String id_post_str = post.getString("id_str"); | |
Resource postResource = this.model.createResource(BASE_URL + "post/" + id_post_str); | |
postResource.addProperty(RDF.type, this.siocModel.getResource(TYPES_NS + "MicroblogPost")); | |
parsePostData(postResource, post); | |
parsePostEntities(postResource, post); | |
//Create the user object | |
JSONObject user = post.getJSONObject("user"); | |
String id_user_str = user.getString("id_str"); | |
Resource userResource = this.model.createResource(BASE_URL + "user/" + id_user_str); | |
userResource.addProperty(RDF.type, this.siocModel.getResource(SIOC_NS + "UserAccount")); | |
parseUserData(userResource, user); | |
//link the post with its creator | |
userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "creator_of"), postResource); | |
postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "has_creator"), userResource); | |
//Obtain information about the reply chain. Note we are not distinguishing between reply and retweet! | |
//How will you do it? ;) | |
String parent_id_str = post.getString("in_reply_to_status_id_str"); | |
if (parent_id_str != null && !parent_id_str.equalsIgnoreCase("null")) { | |
Resource postParentResource = this.model.createResource(BASE_URL + "post/" + parent_id_str); | |
postParentResource.addProperty(RDF.type, this.siocModel.getResource(TYPES_NS + "MicroblogPost")); | |
postParentResource.addProperty(this.siocModel.getProperty(SIOC_NS + "has_reply"), postResource); | |
postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "reply_of"), postParentResource); | |
} | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
private void parsePostData(Resource postResource, JSONObject post) { | |
try { | |
String text = post.getString("text"); | |
text = text.replace("\"", "'"); | |
String created_at = post.getString("created_at"); | |
postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "content"), text); | |
postResource.addProperty(this.siocModel.getProperty(DCTERMS_NS + "date"), created_at); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
private void parsePostEntities(Resource postResource, JSONObject post) { | |
//Obtain information about the entities within the post | |
JSONObject entities = post.getJSONObject("entities"); | |
ArrayList<String> userMentionList = new ArrayList<String>(); | |
if (entities != null) { | |
//get hashtags | |
JSONArray tagListJSON = entities.getJSONArray("hashtags"); | |
for (int i = 0; i < tagListJSON.size(); i++) { | |
String tag = tagListJSON.getJSONObject(i).getString("text"); | |
Resource tagResource = this.model.createResource(BASE_URL + "tag/" + tag.toLowerCase()); | |
postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "topic"), tagResource); | |
} | |
//get user mentions. Note user mentions are not included in the model | |
//How will you do it? ;) | |
JSONArray userMentionListJSON = entities.getJSONArray("user_mentions"); | |
for (int i = 0; i < userMentionListJSON.size(); i++) { | |
String userId = userMentionListJSON.getJSONObject(i).getString("id_str"); | |
userMentionList.add(userId); | |
} | |
//get urls | |
JSONArray urlListJSON = entities.getJSONArray("urls"); | |
for (int i = 0; i < urlListJSON.size(); i++) { | |
String url = urlListJSON.getJSONObject(i).getString("expanded_url"); | |
postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "links_to"), url); | |
} | |
} | |
} | |
private void parseUserData(Resource userResource, JSONObject user) { | |
try { | |
String name = user.getString("name"); | |
String screen_name = user.getString("screen_name"); | |
String description = user.getString("description"); | |
String url_user = user.getString("url"); | |
String user_created_at = user.getString("created_at"); | |
userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "name"), screen_name); | |
userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "title"), name); | |
userResource.addProperty(this.siocModel.getProperty(DCTERMS_NS + "created"), user_created_at); | |
userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "note"), description); | |
userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "avatar"), url_user); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
// ----------------------- | |
// main | |
// ----------------------- | |
public static void main(String[] args) throws Exception { | |
String propertiesFile; | |
if (args.length < 1) { | |
System.out.println("*** Input required: path to the .properties file [e.g. java -jar SIOCTwitterParser.jar ./siocTwitterParser.properties] ***"); | |
System.out.println(); | |
System.out.println("== Format of the properties file ==="); | |
System.out.println(); | |
System.out.println("siocFolder=/tutorials/2014_ESWC/code/project_source/src/main/resources/sioc"); | |
System.out.println("jsonInputFile=/tutorials/2014_ESWC/code/project_source/src/main/resources/eswc.json"); | |
System.out.println("rdfOutputFile=/tutorials/2014_ESWC/code/project_source/src/main/resources/output.rdf"); | |
} else { | |
propertiesFile = args[0]; | |
SIOCTwitterParser parser = new SIOCTwitterParser(propertiesFile); | |
String json = parser.readJSON(); | |
parser.parsePostList(json); | |
parser.printOntologyModel(); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Local folder (or single file) holding the SIOC ontology
siocOntologyFolder=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/sioc
# Twitter JSON file to transform
jsonInputFile=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/eswc.json
# File that receives the generated RDF/XML
rdfOutputFile=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/output.rdf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This Gist contains three main files:
(1) SIOCTwitterParser.java: contains the code that you need to parse Twitter data and transform it into SIOC format
(2) pom.xml: contains the dependencies. If you prefer not to use a maven project, just download the corresponding libraries
(3) siocTwitterParser.properties: this is the properties file that you need to set up, including:
- siocOntologyFolder. This is the local folder in your computer where you store the SIOC ontology
- jsonInputFile. This is an example of a Twitter JSON file. Note that you can also connect directly to Twitter, download data and transform it! :)
- rdfOutputFile. This is the output file containing SIOC transformed Twitter data