Skip to content

Instantly share code, notes, and snippets.

@ClickerMonkey
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ClickerMonkey/f481b31c8898b55ff4f9 to your computer and use it in GitHub Desktop.
Save ClickerMonkey/f481b31c8898b55ff4f9 to your computer and use it in GitHub Desktop.
Gathers information about all articles on Wikipedia and builds a graph of all related articles.
/**
 * Streams a MediaWiki XML dump and writes one line per main-namespace
 * ({@code ns == 0}) page describing the pages it links to.
 *
 * <p>Expected element layout (only these elements are inspected):
 * <pre>
 * mediawiki &gt; page &gt; title
 *                  &gt; ns
 *                  &gt; id
 *                  &gt; revision &gt; text
 * </pre>
 *
 * <p>Each output line has the form
 * {@code id>title>linkCount>link1>link2>...}. Records are written in the
 * order the pages appear in the dump.
 */
public class Wikipedia
{

    /** A page of the dump together with the titles its wiki text links to. */
    public static class Article implements Serializable {
        private static final long serialVersionUID = 1L;
        /** Value of the {@code <id>} element that is a direct child of {@code <page>}. */
        public int id;
        /** Value of the page's {@code <title>} element; {@code null} if the page had none. */
        public String title;
        /** Link targets found in the page text (see {@link Wikipedia#extractLinks}). */
        public Set<String> related = new HashSet<String>();
    }

    /** Elements the handler cares about; everything else is {@code unknown}. */
    private enum NodeType {
        mediawiki, page, ns, id, revision, title, text, unknown
    }

    /** Element name to node type. Names that are absent map to {@code unknown}. */
    private static final Map<String, NodeType> nodeMap = new HashMap<String, NodeType>();
    static {
        for (NodeType type : NodeType.values()) {
            if (type != NodeType.unknown) {
                nodeMap.put( type.name(), type );
            }
        }
    }

    /** Input used when no first command-line argument is given. */
    private static final String DEFAULT_INPUT = "C:\\Users\\pdiffenderfer\\Downloads\\enwiki-20140304-pages-articles-multistream.xml";

    /** Output file used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT = "./wikipedia.txt";

    /** A progress line is printed to standard output every this many articles. */
    private static final int PROGRESS_INTERVAL = 10000;

    /**
     * Converts a dump into the link file.
     *
     * @param args optional: {@code args[0]} is the dump location (path or URI),
     *             {@code args[1]} is the output file; missing arguments fall
     *             back to the built-in defaults
     * @throws Exception if the dump cannot be read or parsed, or the output
     *                   cannot be written
     */
    public static void main(String[] args) throws Exception
    {
        String location = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String out = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // Titles are arbitrary Unicode, so do not depend on the platform charset.
        PrintStream stream = new PrintStream( out, "UTF-8" );
        long read;
        try {
            read = convert( new org.xml.sax.InputSource( location ), stream );
            // PrintStream swallows I/O errors; surface them instead of
            // silently leaving a truncated file behind.
            if (stream.checkError()) {
                throw new IllegalStateException( "Failed writing to " + out );
            }
        } finally {
            stream.close();
        }
        System.out.println("Articles Read: " + read);
    }

    /**
     * Parses a dump and writes one record per main-namespace page.
     *
     * @param source the XML dump to read
     * @param stream destination for the records; it is not closed by this method
     * @return the number of records written
     * @throws Exception if the parser cannot be created or the XML is malformed
     */
    public static long convert(org.xml.sax.InputSource source, PrintStream stream) throws Exception
    {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser saxParser = factory.newSAXParser();
        DumpHandler handler = new DumpHandler( stream );
        saxParser.parse( source, handler );
        return handler.read;
    }

    /**
     * Collects the link targets of every {@code [[target]]} or
     * {@code [[target|label]]} in the given wiki text.
     *
     * <p>Only the target (the part before the first {@code '|'}) is examined.
     * A target is skipped when it is empty, contains {@code ':'}, {@code '#'},
     * {@code '{'} or a line break, or starts with {@code '/'}. A trailing
     * {@code [[} without a closing {@code ]]} is ignored.
     *
     * @param wikiText raw wiki markup
     * @return the distinct accepted targets; never {@code null}
     */
    public static Set<String> extractLinks(CharSequence wikiText)
    {
        Set<String> links = new HashSet<String>();
        String markup = wikiText.toString();
        int start = markup.indexOf( "[[" );
        while (start != -1) {
            int end = markup.indexOf( "]]", start + 2 );
            if (end == -1) {
                // Unterminated link: there is no complete target to record.
                break;
            }
            String target = markup.substring( start + 2, end );
            int pipe = target.indexOf( '|' );
            if (pipe != -1) {
                target = target.substring( 0, pipe );
            }
            if (isArticleTarget( target )) {
                links.add( target );
            }
            start = markup.indexOf( "[[", end + 2 );
        }
        return links;
    }

    /** Returns whether a link target should be recorded as a related article. */
    private static boolean isArticleTarget(String target)
    {
        return target.length() > 0
            && target.indexOf( ':' ) == -1
            && target.indexOf( '#' ) == -1
            && target.indexOf( '{' ) == -1
            && target.indexOf( '\n' ) == -1   // a line break would split the output record
            && target.indexOf( '\r' ) == -1
            && target.charAt( 0 ) != '/';
    }

    /** Parses the trimmed content of a numeric element. */
    private static int parseInt(CharSequence digits)
    {
        return Integer.parseInt( digits.toString().trim() );
    }

    /** Writes a single record line for the article. */
    private static void print(PrintStream out, Article a)
    {
        // Build the line first: one write per record, and digits that do not
        // depend on the default locale.
        StringBuilder line = new StringBuilder();
        line.append( a.id ).append( '>' ).append( a.title ).append( '>' ).append( a.related.size() );
        for (String x : a.related) {
            line.append( '>' ).append( x );
        }
        out.println( line );
    }

    /** SAX handler that assembles one {@link Article} per page and writes it out. */
    private static final class DumpHandler extends DefaultHandler {
        private final PrintStream stream;
        /** Types of the currently open elements, innermost on top. */
        private final Stack<NodeType> stack = new Stack<NodeType>();
        /** Accumulated content of the current page's {@code <text>} elements. */
        private final StringBuilder text = new StringBuilder();
        /** Accumulated content of the title/ns/id element currently open. */
        private final StringBuilder value = new StringBuilder();
        /** Page being read, or {@code null} while outside any {@code <page>}. */
        private Article current;
        /** Namespace from the most recently closed {@code <ns>} element. */
        private int ns;
        /** Number of records written so far. */
        private long read;

        DumpHandler(PrintStream stream) {
            this.stream = stream;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            NodeType type = nodeMap.get( qName );
            if (type == null) {
                type = NodeType.unknown;
            }
            stack.push( type );
            value.setLength( 0 );
            if (type == NodeType.page) {
                current = new Article();
                text.setLength( 0 );
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            if (current == null || stack.isEmpty()) {
                return;
            }
            // The parser may deliver one text node in several chunks, so only
            // accumulate here and interpret the value when the element closes.
            switch (stack.peek()) {
                case id:
                case ns:
                case title:
                    value.append( ch, start, length );
                    break;
                case text:
                    text.append( ch, start, length );
                    break;
                default:
                    break;
            }
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            NodeType popped = stack.pop();
            if (current == null) {
                return;
            }
            switch (popped) {
                case title:
                    current.title = value.toString();
                    break;
                case ns:
                    ns = parseInt( value );
                    break;
                case id:
                    // Revisions and contributors carry their own <id>; only the
                    // one directly under <page> identifies the article.
                    if (!stack.isEmpty() && stack.peek() == NodeType.page) {
                        current.id = parseInt( value );
                    }
                    break;
                case page:
                    finishPage();
                    break;
                default:
                    break;
            }
        }

        /** Emits the finished page if it is an article and resets per-page state. */
        private void finishPage() {
            if (ns == 0) {
                current.related.addAll( extractLinks( text ) );
                print( stream, current );
                read++;
                if (read % PROGRESS_INTERVAL == 0) {
                    System.out.println("Articles Read: " + read);
                }
            }
            // Always discard the text, otherwise a skipped page's links would
            // be attributed to the next article.
            text.setLength( 0 );
            current = null;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment