Skip to content

Instantly share code, notes, and snippets.

@shilad
Last active August 29, 2015 14:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shilad/af757ed4824f0f467402 to your computer and use it in GitHub Desktop.
Save shilad/af757ed4824f0f467402 to your computer and use it in GitHub Desktop.
A program to extract geographic information and citations from Wikipedia
package org.wikibrain.cookbook.spatial;
import com.vividsolutions.jts.geom.Geometry;
import org.apache.commons.lang.StringUtils;
import org.wikibrain.conf.ConfigurationException;
import org.wikibrain.core.cmd.Env;
import org.wikibrain.core.cmd.EnvBuilder;
import org.wikibrain.core.dao.DaoException;
import org.wikibrain.core.dao.LocalPageDao;
import org.wikibrain.core.dao.RawPageDao;
import org.wikibrain.core.dao.UniversalPageDao;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.model.LocalPage;
import org.wikibrain.core.model.RawPage;
import org.wikibrain.spatial.core.dao.SpatialDataDao;
import org.wikibrain.utils.ParallelForEach;
import org.wikibrain.utils.Procedure;
import org.wikibrain.utils.WpIOUtils;
import org.wikibrain.utils.WpThreadUtils;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Exports geotagged Wikipedia articles, the country whose geometry contains each of them,
 * and the URLs found in each article's raw text as a tab-separated file.
 *
 * @author Shilad Sen
 */
public class CitationAnalyzer {

    /** Column names written as the first line of the TSV, in output order. */
    private static final List<String> FIELDS = Collections.unmodifiableList(Arrays.asList(
            "language",
            "articleId",
            "articleTitle",
            "articleLat",
            "articleLong",
            "countryId",
            "countryTitle",
            "countryLat",
            "countryLong",
            "url"
    ));

    /**
     * URL matcher adapted from http://stackoverflow.com/a/1806161/141245.
     *
     * Compiled once because the expression is large and extractUrls is called for every
     * article. Matching is case-insensitive because scheme, host and percent-escape hex
     * digits are case-insensitive in URLs (e.g. "%2F" must match as well as "%2f").
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)" +
            "(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
            "|mil|biz|info|mobi|name|aero|jobs|museum" +
            "|travel|[a-z]{2}))(:[\\d]{1,5})?" +
            "(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?" +
            "((\\?([-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?" +
            "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)" +
            "(&(?:[-\\w~!$+|.,*:]|%[a-f\\d]{2})+=?" +
            "([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*" +
            "(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b",
            Pattern.CASE_INSENSITIVE);

    private final Language language;
    private final LocalPageDao pageDao;
    private final SpatialDataDao spatialDao;
    private final UniversalPageDao conceptDao;
    private final RawPageDao rawPageDao;

    /** Country pages (in {@link #language}) mapped to their geometry; filled once by the constructor. */
    private final Map<LocalPage, Geometry> countries = new HashMap<LocalPage, Geometry>();

    /**
     * Looks up the required DAOs and resolves every geometry of the "country" layer to a
     * local page in the given language. Countries without a local page are skipped.
     *
     * @param env      environment whose configurator supplies the DAOs
     * @param language language edition used for all page lookups
     * @throws ConfigurationException if a DAO cannot be obtained from the configurator
     * @throws DaoException           if loading the country geometries or pages fails
     */
    public CitationAnalyzer(Env env, Language language) throws ConfigurationException, DaoException {
        this.language = language;
        this.pageDao = env.getConfigurator().get(LocalPageDao.class);
        this.spatialDao = env.getConfigurator().get(SpatialDataDao.class);
        this.conceptDao = env.getConfigurator().get(UniversalPageDao.class);
        this.rawPageDao = env.getConfigurator().get(RawPageDao.class);

        Map<Integer, Geometry> idsToCountries = spatialDao.getAllGeometriesInLayer("country");
        for (Map.Entry<Integer, Geometry> entry : idsToCountries.entrySet()) {
            LocalPage country = getLocalPage(entry.getKey());
            if (country != null) {
                countries.put(country, entry.getValue());
            }
        }
        System.out.println("resolved " + countries.size() + " countries");
    }

    /**
     * Writes the TSV report: a header line, then for every geometry in the "wikidata" layer
     * that resolves to an article inside a known country, one row with url "NULL" followed by
     * one row per URL extracted from the article text.
     *
     * @param file destination file
     * @throws DaoException if the geotag layer cannot be loaded
     * @throws IOException  if the file cannot be opened, written or closed
     */
    public void createCsv(File file) throws DaoException, IOException {
        final Map<Integer, Geometry> geotags = spatialDao.getAllGeometriesInLayer("wikidata");
        final BufferedWriter writer = WpIOUtils.openWriter(file);
        try {
            writer.write(StringUtils.join(FIELDS, "\t") + "\n");
            // The trailing 10000 is passed through unchanged from the original code;
            // NOTE(review): presumably a progress-logging interval - confirm against ParallelForEach.
            ParallelForEach.loop(
                    geotags.keySet(), WpThreadUtils.getMaxThreads(),
                    new Procedure<Integer>() {
                        @Override
                        public void call(Integer conceptId) throws Exception {
                            writeOneConcept(writer, conceptId, geotags.get(conceptId));
                        }
                    },
                    10000);
        } finally {
            // Close even when loading or writing fails so the file handle is not leaked.
            writer.close();
        }
    }

    /**
     * Writes all rows for a single geotagged concept. Nothing is written when the concept has
     * no local page or when no country geometry contains it.
     *
     * Invoked from the procedure handed to ParallelForEach in {@link #createCsv(File)}; every
     * row is emitted with exactly one write call so a row is never split over several calls.
     * NOTE(review): this relies on the writer serializing individual write calls - confirm
     * for the writer returned by WpIOUtils.openWriter.
     *
     * @param writer     destination for the rows
     * @param conceptId  universal concept id of the article
     * @param articleGeo geometry tagged on the article
     * @throws DaoException if a page lookup fails
     * @throws IOException  if writing fails
     */
    private void writeOneConcept(BufferedWriter writer, int conceptId, Geometry articleGeo) throws DaoException, IOException {
        LocalPage article = getLocalPage(conceptId);
        if (article == null) {
            return; // skip the country scan when the article cannot be resolved
        }
        LocalPage country = getContainingCountry(articleGeo);
        if (country == null) {
            return;
        }
        Geometry countryGeo = countries.get(country);

        // In JTS coordinates x is the horizontal axis (longitude) and y the vertical one
        // (latitude), so latitude columns take getY() and longitude columns take getX().
        // NOTE(review): assumes the spatial DAO stores points as (lon, lat) - confirm.
        List<String> leadingColumns = Arrays.asList(
                language.getLangCode(),
                "" + article.getLocalId(),
                "" + article.getTitle(),
                "" + articleGeo.getCentroid().getY(),
                "" + articleGeo.getCentroid().getX(),
                "" + country.getLocalId(),
                "" + country.getTitle(),
                "" + countryGeo.getCentroid().getY(),
                "" + countryGeo.getCentroid().getX()
        );
        // Every row of this article shares the first nine columns; only the url differs.
        String prefix = StringUtils.join(leadingColumns, "\t") + "\t";
        writer.write(prefix + "NULL" + "\n");

        RawPage page = rawPageDao.getById(language, article.getLocalId());
        String body = (page == null) ? null : page.getBody();
        for (String url : extractUrls(body)) {
            // Collapse whitespace so a URL can never introduce a tab or newline into the row.
            String cleaned = url.replaceAll("\\s+", " ").trim();
            writer.write(prefix + cleaned + "\n");
        }
    }

    /**
     * Finds a country whose geometry contains the given geometry.
     *
     * @param point geometry to locate
     * @return the first matching country page in map iteration order, or null if none contains it
     */
    private LocalPage getContainingCountry(Geometry point) {
        for (Map.Entry<LocalPage, Geometry> entry : countries.entrySet()) {
            if (entry.getValue().contains(point)) {
                return entry.getKey();
            }
        }
        return null;
    }

    /**
     * Extracts every substring of the input that looks like a URL.
     * Pattern adapted from http://stackoverflow.com/a/1806161/141245
     *
     * @param input text to scan; may be null
     * @return the matches in order of appearance; empty (never null) when input is null or has no match
     */
    public static List<String> extractUrls(String input) {
        List<String> result = new ArrayList<String>();
        if (input == null) {
            return result;
        }
        Matcher matcher = URL_PATTERN.matcher(input);
        while (matcher.find()) {
            result.add(matcher.group());
        }
        return result;
    }

    /**
     * Resolves a universal concept id to the local page in {@link #language}.
     *
     * @param conceptId universal concept id
     * @return the local page, or null when the concept DAO reports a negative local id
     * @throws DaoException if a DAO lookup fails
     */
    private LocalPage getLocalPage(int conceptId) throws DaoException {
        int pageId = conceptDao.getLocalId(language, conceptId);
        if (pageId < 0) {
            return null;
        }
        return pageDao.getById(language, pageId);
    }

    /**
     * Builds the environment from the command-line arguments and writes the report for
     * English to "citations.tsv" in the working directory.
     */
    public static void main(String[] args) throws Exception {
        Env env = EnvBuilder.envFromArgs(args);
        CitationAnalyzer analyzer = new CitationAnalyzer(env, Language.EN);
        analyzer.createCsv(new File("citations.tsv"));
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment