Skip to content

Instantly share code, notes, and snippets.

@miriamfs
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save miriamfs/e1738c7e17ce4a479dbe to your computer and use it in GitHub Desktop.
Save miriamfs/e1738c7e17ce4a479dbe to your computer and use it in GitHub Desktop.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build descriptor for the SIOC Twitter parser.
  It declares the two third-party libraries the Java source imports:
  Apache Jena (com.hp.hpl.jena.*) and json-lib (net.sf.json.*).
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates -->
<groupId>siocTwitterParser</groupId>
<artifactId>siocTwitterParser</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Apache Jena: RDF/ontology model API. Declared with type "pom"; presumably this
     artifact aggregates the individual Jena libraries (confirm against the Jena docs). -->
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>apache-jena-libs</artifactId>
<type>pom</type>
<version>2.11.1</version>
</dependency>
<!-- json-lib: JSON parsing. A classifier is given explicitly; NOTE(review): json-lib
     appears to be published only with JDK-specific classifiers, so keep it unless verified. -->
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.4</version>
<classifier>jdk15</classifier>
</dependency>
</dependencies>
</project>
package uk.ac.open.kmi.data.parser;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.OntModelSpec;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.ModelFactory;
import com.hp.hpl.jena.rdf.model.Resource;
import com.hp.hpl.jena.vocabulary.RDF;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Properties;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import net.sf.json.JSONSerializer;
/**
 * Converts a Twitter search result stored as JSON into an RDF graph that uses the
 * SIOC vocabulary, and writes that graph to a file.
 *
 * <p>The locations of the SIOC ontology, the JSON input and the RDF output are read
 * from a {@code .properties} file (keys {@code siocOntologyFolder},
 * {@code jsonInputFile} and {@code rdfOutputFile}).
 *
 * <p>Error handling is deliberately best-effort: problems are reported on the console
 * and processing continues with whatever data could be read.
 */
public class SIOCTwitterParser {

    private static final String BASE_URL = "https://twitter.com/sioc/";
    private static final String SIOC_NS = "http://rdfs.org/sioc/ns#";
    private static final String TYPES_NS = "http://rdfs.org/sioc/types#";
    private static final String DCTERMS_NS = "http://purl.org/dc/terms/";

    /** Charset used to decode the JSON input instead of the platform default. */
    private static final String INPUT_CHARSET = "UTF-8";

    /** Ontology model the SIOC ontology is loaded into; used to look up classes and properties. */
    private final OntModel siocModel;
    /** Output graph that receives the generated triples. */
    private final Model model;

    private String siocOntologyFolder;
    private String jsonInputFile;
    private String rdfOutputFile;

    // -----------------------
    // Creating the ontology models
    // -----------------------

    /**
     * Reads the configuration, creates the empty output model and loads the SIOC ontology.
     *
     * @param propertiesFile path to the properties file describing input and output locations
     */
    public SIOCTwitterParser(String propertiesFile) {
        readProperties(propertiesFile);

        // Create the model and set up the namespaces
        this.model = ModelFactory.createDefaultModel();
        this.model.setNsPrefix("sioc", SIOC_NS);
        this.model.setNsPrefix("dcterms", DCTERMS_NS);
        this.model.setNsPrefix("types", TYPES_NS);

        // Load the SIOC ontology
        System.out.println("Loading the SIOC ontology");
        this.siocModel = ModelFactory.createOntologyModel(OntModelSpec.OWL_MEM);
        if (this.siocOntologyFolder == null) {
            // new File(null) would fail with a bare NullPointerException; report the cause instead.
            System.out.println("Error: the property siocOntologyFolder is not set, the SIOC ontology was not loaded");
        } else {
            loadModel(this.siocModel, new File(this.siocOntologyFolder), "", "RDF/XML");
        }
    }

    // -----------------------
    // Reading input/output for the program
    // -----------------------

    /**
     * Loads {@code siocOntologyFolder}, {@code jsonInputFile} and {@code rdfOutputFile}
     * from the given properties file. Failures are reported on the console and leave the
     * affected settings unset.
     *
     * @param propertiesFile path to the properties file
     */
    public void readProperties(String propertiesFile) {
        InputStream in = null;
        try {
            in = new FileInputStream(propertiesFile);
            Properties properties = new Properties();
            properties.load(in);
            this.siocOntologyFolder = properties.getProperty("siocOntologyFolder");
            this.jsonInputFile = properties.getProperty("jsonInputFile");
            this.rdfOutputFile = properties.getProperty("rdfOutputFile");
        } catch (Exception e) {
            System.out.println("Error: The properties file " + propertiesFile + " can not be read ");
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Reads one ontology file, or every file directly inside a folder, into the given model.
     *
     * @param ontModel       model that receives the statements
     * @param repositoryPath a single ontology file or a folder of ontology files
     * @param base           base URI passed to the reader
     * @param format         serialization name passed to the reader
     */
    private void loadModel(OntModel ontModel, File repositoryPath, String base, String format) {
        try {
            if (repositoryPath.isDirectory()) {
                File[] files = repositoryPath.listFiles();
                if (files == null) {
                    // listFiles() returns null when the folder can not be listed.
                    System.out.println("Error: The folder " + repositoryPath + " can not be read ");
                    return;
                }
                for (File file : files) {
                    readOntologyFile(ontModel, file, file.getName(), base, format);
                }
            } else {
                System.out.println("FILE: " + repositoryPath.getAbsolutePath());
                readOntologyFile(ontModel, repositoryPath, repositoryPath.toString(), base, format);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a single file into the model and always closes the stream afterwards.
     * A file that can not be opened or parsed is reported and skipped so that the
     * remaining files of a folder are still loaded.
     */
    private void readOntologyFile(OntModel ontModel, File file, String label, String base, String format) {
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            ontModel.read(in, base, format);
        } catch (FileNotFoundException e) {
            System.out.println("Error: The file " + label + " can not be added to the model ");
        } catch (Exception e) {
            System.out.println("Error: The file " + label + " can not be added to the model ");
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
    }

    // -----------------------
    // Writing the ontology model
    // -----------------------

    /**
     * Serializes the generated graph as {@code RDF/XML-ABBREV} into the configured output file.
     */
    public void printOntologyModel() {
        OutputStream out = null;
        try {
            // Write through an OutputStream rather than a FileWriter so the bytes on disk
            // do not depend on the platform default charset.
            out = new FileOutputStream(this.rdfOutputFile);
            this.model.write(out, "RDF/XML-ABBREV");
            out.close();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // No-op after a successful close; releases the file handle on failure.
            closeQuietly(out);
        }
    }

    // -----------------------
    // Load and parse the Twitter JSON
    // -----------------------

    /**
     * Reads the configured JSON input file into a single string.
     * Line terminators are dropped, as lines are appended without a separator.
     *
     * @return the file content, or whatever was read before an error occurred (possibly empty)
     */
    public String readJSON() {
        System.out.println("Obtaining Twitter Data");
        StringBuilder json = new StringBuilder();
        InputStream in = null;
        BufferedReader reader = null;
        try {
            in = new FileInputStream(this.jsonInputFile);
            reader = new BufferedReader(new InputStreamReader(in, INPUT_CHARSET));
            String line = reader.readLine();
            while (line != null) {
                json.append(line);
                line = reader.readLine();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
            closeQuietly(in);
        }
        return json.toString();
    }

    /**
     * Parses a JSON document and converts every entry of its {@code statuses} array.
     *
     * @param json JSON text containing a top-level {@code statuses} array
     */
    public void parsePostList(String json) {
        System.out.println("Transforming Twitter data into SIOC ");
        try {
            JSONObject jsonObject = (JSONObject) JSONSerializer.toJSON(json);
            JSONArray jsonPostsList = jsonObject.getJSONArray("statuses");
            for (int i = 0; i < jsonPostsList.size(); i++) {
                parsePost(jsonPostsList.getJSONObject(i));
            }
        } catch (Exception e) {
            System.out.println("no posts " + json);
            e.printStackTrace();
        }
    }

    /**
     * Adds one post, its author and (if present) its reply relation to the output graph.
     *
     * @param post JSON object describing a single status
     */
    public void parsePost(JSONObject post) {
        try {
            // Create the post object
            String postId = post.getString("id_str");
            Resource postResource = this.model.createResource(BASE_URL + "post/" + postId);
            postResource.addProperty(RDF.type, this.siocModel.getResource(TYPES_NS + "MicroblogPost"));
            parsePostData(postResource, post);
            parsePostEntities(postResource, post);

            // Create the user object
            JSONObject user = post.getJSONObject("user");
            String userId = user.getString("id_str");
            Resource userResource = this.model.createResource(BASE_URL + "user/" + userId);
            userResource.addProperty(RDF.type, this.siocModel.getResource(SIOC_NS + "UserAccount"));
            parseUserData(userResource, user);

            // Link the post with its creator
            userResource.addProperty(this.siocModel.getProperty(SIOC_NS + "creator_of"), postResource);
            postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "has_creator"), userResource);

            // Obtain information about the reply chain. Note we are not distinguishing between reply and retweet!
            // How will you do it? ;)
            // optString: a status without this key is simply not a reply, not an error.
            String parentId = post.optString("in_reply_to_status_id_str", null);
            if (parentId != null && !parentId.equalsIgnoreCase("null")) {
                Resource parentResource = this.model.createResource(BASE_URL + "post/" + parentId);
                parentResource.addProperty(RDF.type, this.siocModel.getResource(TYPES_NS + "MicroblogPost"));
                parentResource.addProperty(this.siocModel.getProperty(SIOC_NS + "has_reply"), postResource);
                postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "reply_of"), parentResource);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the text ({@code sioc:content}) and creation date ({@code dcterms:date}) of a post.
     * Double quotes in the text are replaced by single quotes.
     */
    private void parsePostData(Resource postResource, JSONObject post) {
        try {
            String text = post.getString("text").replace("\"", "'");
            String createdAt = post.getString("created_at");
            postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "content"), text);
            postResource.addProperty(this.siocModel.getProperty(DCTERMS_NS + "date"), createdAt);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds hashtags ({@code sioc:topic}) and URLs ({@code sioc:links_to}) found in the
     * {@code entities} section of a post. A post without entities, or without one of the
     * entity lists, is accepted. Like the sibling helpers, errors are reported here so a
     * malformed entity section does not prevent the author and reply data from being added.
     */
    private void parsePostEntities(Resource postResource, JSONObject post) {
        try {
            // optJSONObject returns null for a missing/non-object value, whereas
            // getJSONObject hands back a "null object" whose accessors throw.
            JSONObject entities = post.optJSONObject("entities");
            if (entities == null || entities.isNullObject()) {
                return;
            }

            // get hashtags
            JSONArray hashtags = entities.optJSONArray("hashtags");
            if (hashtags != null) {
                for (int i = 0; i < hashtags.size(); i++) {
                    String tag = hashtags.getJSONObject(i).getString("text");
                    // Fixed locale so tag URIs are the same on every machine.
                    String tagUri = BASE_URL + "tag/" + tag.toLowerCase(java.util.Locale.ENGLISH);
                    Resource tagResource = this.model.createResource(tagUri);
                    postResource.addProperty(this.siocModel.getProperty(SIOC_NS + "topic"), tagResource);
                }
            }

            // get user mentions. Note user mentions are not included in the model
            // How will you do it? ;)
            // The ids are collected but intentionally not used yet (left as an exercise).
            ArrayList<String> userMentionList = new ArrayList<String>();
            JSONArray userMentions = entities.optJSONArray("user_mentions");
            if (userMentions != null) {
                for (int i = 0; i < userMentions.size(); i++) {
                    userMentionList.add(userMentions.getJSONObject(i).getString("id_str"));
                }
            }

            // get urls
            // NOTE(review): the URL is stored as a plain literal; confirm whether
            // sioc:links_to should point to a resource instead.
            JSONArray urls = entities.optJSONArray("urls");
            if (urls != null) {
                for (int i = 0; i < urls.size(); i++) {
                    addLiteralIfPresent(postResource, SIOC_NS + "links_to",
                            optText(urls.getJSONObject(i), "expanded_url"));
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the descriptive properties of a user account. Fields that are missing or
     * JSON {@code null} are skipped individually instead of being written as the
     * literal text "null" or aborting the remaining fields.
     */
    private void parseUserData(Resource userResource, JSONObject user) {
        try {
            addLiteralIfPresent(userResource, SIOC_NS + "name", optText(user, "screen_name"));
            addLiteralIfPresent(userResource, SIOC_NS + "title", optText(user, "name"));
            addLiteralIfPresent(userResource, DCTERMS_NS + "created", optText(user, "created_at"));
            addLiteralIfPresent(userResource, SIOC_NS + "note", optText(user, "description"));
            // NOTE(review): the "url" field is written as sioc:avatar; confirm whether a
            // profile image field was intended.
            addLiteralIfPresent(userResource, SIOC_NS + "avatar", optText(user, "url"));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the text stored under {@code key}, or {@code null} when the key is absent
     * or holds JSON {@code null} (which json-lib renders as the text "null").
     */
    private static String optText(JSONObject json, String key) {
        String value = json.optString(key, null);
        if (value == null || "null".equals(value)) {
            return null;
        }
        return value;
    }

    /** Adds {@code value} as a literal of the given property unless it is {@code null}. */
    private void addLiteralIfPresent(Resource subject, String propertyUri, String value) {
        if (value != null) {
            subject.addProperty(this.siocModel.getProperty(propertyUri), value);
        }
    }

    /** Closes a resource if it was opened; a failure while closing is ignored. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done when closing fails.
        }
    }

    // -----------------------
    // main
    // -----------------------

    /**
     * Command-line entry point. Expects the path to the properties file as first argument;
     * without it, usage information is printed.
     *
     * @param args command-line arguments
     * @throws Exception declared for compatibility; errors are normally reported on the console
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println("*** Input required: path to the .properties file [e.g. java -jar SIOCTwitterParser.jar ./siocTwitterParser.properties] ***");
            System.out.println();
            System.out.println("== Format of the properties file ===");
            System.out.println();
            // The key must be siocOntologyFolder: that is the name readProperties looks up.
            System.out.println("siocOntologyFolder=/tutorials/2014_ESWC/code/project_source/src/main/resources/sioc");
            System.out.println("jsonInputFile=/tutorials/2014_ESWC/code/project_source/src/main/resources/eswc.json");
            System.out.println("rdfOutputFile=/tutorials/2014_ESWC/code/project_source/src/main/resources/output.rdf");
            return;
        }
        SIOCTwitterParser parser = new SIOCTwitterParser(args[0]);
        String json = parser.readJSON();
        parser.parsePostList(json);
        parser.printOntologyModel();
    }
}
# Settings read by SIOCTwitterParser.readProperties(); adjust the paths to your machine.
# Ontology file, or folder of ontology files, loaded into the SIOC model
siocOntologyFolder=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/sioc
# JSON file that is read and converted
jsonInputFile=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/eswc.json
# File the generated RDF is written to
rdfOutputFile=/slides/tutorials/2014_ESWC/code/project_source/src/main/resources/output.rdf
@miriamfs
Copy link
Author

This Gist contains three main files:

(1) SIOCTwitterParser.java: contains the code that you need to parse Twitter data and transform it into SIOC format

(2) pom.xml: contains the dependencies. If you prefer not to use a maven project, just download the corresponding libraries

(3) siocTwitterParser.properties: this is the properties file that you need to set up, including:
- siocOntologyFolder. This is the local folder in your computer where you store the SIOC ontology
- jsonInputFile. This is an example of a Twitter JSON file. Note that you can directly connect, download Twitter data and transform it! :)
- rdfOutputFile. This is the output file containing SIOC transformed Twitter data

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment