Skip to content

Instantly share code, notes, and snippets.

@thomasjungblut
Created November 23, 2012 05:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomasjungblut/4134189 to your computer and use it in GitHub Desktop.
Save thomasjungblut/4134189 to your computer and use it in GitHub Desktop.
Bacon Generator
package de.jungblut.bacon;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import com.google.common.collect.HashMultimap;
/**
 * Graph generator for Apache Hama's SSSP. Prints out a tab separated text file
 * of adjacent actors and a lookup file for ids of the actors.
 * <p>
 * An actor's id is its index in the lexicographically sorted array of all
 * actor names read from the input lists. Both output files are written as
 * UTF-8 so that non-ASCII names do not depend on the platform default charset.
 *
 * @author thomas.jungblut
 *
 */
public final class GraphGen {

  private static final String IMDB_FILES_DIR = "files/imdb/";
  private static final String GRAPH_IN_ACTORS_TXT = "files/imdb/graph-in/actors.txt";
  private static final String GRAPH_IN_ADJACENT_ACTORS_TXT = "files/imdb/graph-in/adjacent_actors.txt";
  private static final String START_LINE = "----\t\t\t------";
  private static final String END_LINE = "-----------------------------------------------------------------------------";
  private static final Pattern SPLIT_PATTERN = Pattern.compile("\t\t\t");
  private static final Pattern ALTERNATIVE_SPLIT_PATTERN = Pattern
      .compile("\t\t");

  /**
   * Reads "actors.list" and "actresses.list" from the IMDB files directory,
   * then writes the adjacency file followed by the actor id lookup file.
   * A failure while writing one file is reported on stderr and does not
   * prevent the attempt to write the other one.
   *
   * @param args ignored.
   */
  public static void main(String[] args) {
    // movie -> actors
    HashMultimap<String, String> collaborationMap = HashMultimap.create();
    // actor -> movies
    HashMultimap<String, String> actorMap = HashMultimap.create();
    readActors(collaborationMap, actorMap, "actors.list");
    readActors(collaborationMap, actorMap, "actresses.list");

    // sorted so that an actor's id (its array index) can be recovered with a
    // binary search instead of an additional name -> id map
    Set<String> actorSet = actorMap.keySet();
    String[] actorArray = actorSet.toArray(new String[actorSet.size()]);
    Arrays.sort(actorArray);
    System.out.println(actorArray.length);

    try {
      writeAdjacentActors(actorArray, actorMap, collaborationMap);
    } catch (IOException e) {
      System.err.println("Failed to write " + GRAPH_IN_ADJACENT_ACTORS_TXT);
      e.printStackTrace();
    }

    System.out.println("Flush actor lookups");
    try {
      writeActorLookup(actorArray);
    } catch (IOException e) {
      System.err.println("Failed to write " + GRAPH_IN_ACTORS_TXT);
      e.printStackTrace();
    }
  }

  /**
   * Writes one line per actor: the actor's id, a tab, and then the id of every
   * actor that shares at least one movie with it, each followed by a tab.
   */
  private static void writeAdjacentActors(String[] actorArray,
      HashMultimap<String, String> actorMap,
      HashMultimap<String, String> collaborationMap) throws IOException {
    try (BufferedWriter bw = Files.newBufferedWriter(
        Paths.get(GRAPH_IN_ADJACENT_ACTORS_TXT), StandardCharsets.UTF_8)) {
      for (int i = 0; i < actorArray.length; i++) {
        Set<String> adjacentActors = new HashSet<>();
        for (String movie : actorMap.get(actorArray[i])) {
          // TODO actually you could save the movie over which they are adjacent
          // (edge value)
          adjacentActors.addAll(collaborationMap.get(movie));
        }
        // an actor is not adjacent to itself
        adjacentActors.remove(actorArray[i]);

        StringBuilder sb = new StringBuilder();
        sb.append(i);
        sb.append('\t');
        for (String adjacent : adjacentActors) {
          sb.append(Arrays.binarySearch(actorArray, adjacent));
          sb.append('\t');
        }
        sb.append('\n');
        bw.write(sb.toString());

        // progress output
        if (i % 10000 == 0) {
          System.out.println(i);
        }
      }
    }
  }

  /**
   * Writes the id to actor name lookup, one "id TAB name" line per actor.
   */
  private static void writeActorLookup(String[] actorArray) throws IOException {
    try (BufferedWriter bw = Files.newBufferedWriter(
        Paths.get(GRAPH_IN_ACTORS_TXT), StandardCharsets.UTF_8)) {
      for (int i = 0; i < actorArray.length; i++) {
        bw.write(i + "\t" + actorArray[i] + "\n");
      }
    }
  }

  /**
   * Parses one IMDB list file and records every (actor, movie) pair it
   * contains in both maps.
   * <p>
   * Everything up to and including the start marker line is skipped and
   * parsing stops at the end marker line. An empty line ends the current
   * actor's block; the first field of the next line names the next actor and
   * the following lines are attributed to that actor. Lines that yield no
   * actor name or no movie title are reported on stdout and skipped. An
   * {@link IOException} is reported on stderr; pairs read before the failure
   * stay in the maps.
   *
   * @param collaborationMap movie -> actors, filled by this method.
   * @param actorMap actor -> movies, filled by this method.
   * @param fileName name of the list file inside the IMDB files directory.
   */
  public static void readActors(HashMultimap<String, String> collaborationMap,
      HashMultimap<String, String> actorMap, String fileName) {
    try (BufferedReader br = new BufferedReader(new InputStreamReader(
        new FileInputStream(IMDB_FILES_DIR + fileName),
        StandardCharsets.ISO_8859_1))) {
      boolean start = false;
      String line;
      String currentActor = null;
      while ((line = br.readLine()) != null) {
        if (!start) {
          start = line.equals(START_LINE);
          continue;
        }
        if (line.equals(END_LINE)) {
          break;
        }
        if (line.isEmpty()) {
          currentActor = null;
          continue;
        }

        String[] split = splitFields(line);
        // A line made only of tabs splits into zero fields and a continuation
        // line has an empty first field; neither may become an actor name.
        if (currentActor == null && split.length > 0 && !split[0].isEmpty()) {
          currentActor = split[0];
        }
        if (currentActor == null || split.length < 2) {
          System.out.println("Couldn't parse line correctly: " + line);
          continue;
        }
        String normalizedMovieName = normalizeMovieName(split[1]);
        if (normalizedMovieName == null) {
          // without this guard all such lines would share the movie name ""
          // and their actors would wrongly become adjacent
          System.out.println("Couldn't parse line correctly: " + line);
          continue;
        }
        collaborationMap.put(normalizedMovieName, currentActor);
        actorMap.put(currentActor, normalizedMovieName);
      }
    } catch (IOException e) {
      System.err.println("Failed to read " + IMDB_FILES_DIR + fileName);
      e.printStackTrace();
    }
  }

  /**
   * Splits a line into fields, trying three tabs, then two tabs, then a single
   * tab as separator until exactly two fields are found. The single tab result
   * is returned as is, so callers must check the length themselves.
   */
  private static String[] splitFields(String line) {
    // really? who has implemented this crazy format?
    String[] split = SPLIT_PATTERN.split(line);
    if (split.length != 2) {
      split = ALTERNATIVE_SPLIT_PATTERN.split(line);
      if (split.length != 2) {
        split = line.split("\t");
      }
    }
    return split;
  }

  /**
   * Cuts the title field after its first closing parenthesis, dropping
   * whatever follows it on the line.
   *
   * @return the truncated title, or null if the field contains no ")".
   */
  private static String normalizeMovieName(String titleField) {
    int closing = titleField.indexOf(')');
    return closing < 0 ? null : titleField.substring(0, closing + 1);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment