Last active
August 29, 2015 14:02
-
-
Save shilad/af757ed4824f0f467402 to your computer and use it in GitHub Desktop.
A program to extract geographic information and citations from Wikipedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.wikibrain.cookbook.spatial; | |
import com.vividsolutions.jts.geom.Geometry; | |
import org.apache.commons.lang.StringUtils; | |
import org.wikibrain.conf.ConfigurationException; | |
import org.wikibrain.core.cmd.Env; | |
import org.wikibrain.core.cmd.EnvBuilder; | |
import org.wikibrain.core.dao.DaoException; | |
import org.wikibrain.core.dao.LocalPageDao; | |
import org.wikibrain.core.dao.RawPageDao; | |
import org.wikibrain.core.dao.UniversalPageDao; | |
import org.wikibrain.core.lang.Language; | |
import org.wikibrain.core.model.LocalPage; | |
import org.wikibrain.core.model.RawPage; | |
import org.wikibrain.spatial.core.dao.SpatialDataDao; | |
import org.wikibrain.utils.ParallelForEach; | |
import org.wikibrain.utils.Procedure; | |
import org.wikibrain.utils.WpIOUtils; | |
import org.wikibrain.utils.WpThreadUtils; | |
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.*; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/** | |
* @author Shilad Sen | |
*/ | |
public class CitationAnalyzer { | |
private final Language language; | |
private final LocalPageDao pageDao; | |
private final SpatialDataDao spatialDao; | |
private final UniversalPageDao conceptDao; | |
private final RawPageDao rawPageDao; | |
private Map<LocalPage, Geometry> countries = new HashMap<LocalPage, Geometry>(); | |
public CitationAnalyzer(Env env, Language language) throws ConfigurationException, DaoException { | |
this.language = language; | |
this.pageDao = env.getConfigurator().get(LocalPageDao.class); | |
this.spatialDao = env.getConfigurator().get(SpatialDataDao.class); | |
this.conceptDao = env.getConfigurator().get(UniversalPageDao.class); | |
this.rawPageDao = env.getConfigurator().get(RawPageDao.class); | |
Map<Integer, Geometry> idsToCountries = spatialDao.getAllGeometriesInLayer("country"); | |
for (int conceptId : idsToCountries.keySet()) { | |
LocalPage country = getLocalPage(conceptId); | |
if (country != null) { | |
countries.put(country, idsToCountries.get(conceptId)); | |
} | |
} | |
System.out.println("resolved " + countries.size() + " countries"); | |
} | |
public void createCsv(File file) throws DaoException, IOException { | |
final Map<Integer, Geometry> geotags = spatialDao.getAllGeometriesInLayer("wikidata"); | |
final BufferedWriter writer = WpIOUtils.openWriter(file); | |
List<String> fields = Arrays.asList( | |
"language", | |
"articleId", | |
"articleTitle", | |
"articleLat", | |
"articleLong", | |
"countryId", | |
"countryTitle", | |
"countryLat", | |
"countryLong", | |
"url" | |
); | |
writer.write(StringUtils.join(fields, "\t") + "\n"); | |
ParallelForEach.loop( | |
geotags.keySet(), WpThreadUtils.getMaxThreads(), | |
new Procedure<Integer>() { | |
@Override | |
public void call(Integer conceptId) throws Exception { | |
writeOneConcept(writer, conceptId, geotags.get(conceptId)); | |
} | |
}, | |
10000); | |
writer.close(); | |
} | |
private void writeOneConcept(BufferedWriter writer, int conceptId, Geometry articleGeo) throws DaoException, IOException { | |
LocalPage article = getLocalPage(conceptId); | |
LocalPage country = getContainingCountry(articleGeo); | |
Geometry countryGeo = countries.get(country); | |
if (country == null || article == null) { | |
return; | |
} | |
List<String> row = Arrays.asList( | |
language.getLangCode(), | |
"" + article.getLocalId(), | |
"" + article.getTitle(), | |
"" + articleGeo.getCentroid().getX(), | |
"" + articleGeo.getCentroid().getY(), | |
"" + country.getLocalId(), | |
"" + country.getTitle(), | |
"" + countryGeo.getCentroid().getX(), | |
"" + countryGeo.getCentroid().getY(), | |
"NULL" | |
); | |
writer.write(StringUtils.join(row, "\t") + "\n"); | |
RawPage page = rawPageDao.getById(language, article.getLocalId()); | |
for (String url : extractUrls(page.getBody())) { | |
String cleaned = url.replaceAll("\\s+", " ").trim(); | |
row.set(row.size() - 1, cleaned); | |
writer.write(StringUtils.join(row, "\t") + "\n"); | |
} | |
} | |
private LocalPage getContainingCountry(Geometry point) { | |
for (LocalPage country : countries.keySet()) { | |
if (countries.get(country).contains(point)) { | |
return country; | |
} | |
} | |
return null; | |
} | |
/** | |
* From http://stackoverflow.com/a/1806161/141245 | |
* @param input | |
* @return | |
*/ | |
public static List<String> extractUrls(String input) { | |
List<String> result = new ArrayList<String>(); | |
Pattern pattern = Pattern.compile( | |
"\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www.)" + | |
"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" + | |
"|mil|biz|info|mobi|name|aero|jobs|museum" + | |
"|travel|[a-z]{2}))(:[\\d]{1,5})?" + | |
"(((\\/([-\\w~!$+|.,=]|%[a-f\\d]{2})+)+|\\/)+|\\?|#)?" + | |
"((\\?([-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" + | |
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)" + | |
"(&(?:[-\\w~!$+|.,*:]|%[a-f\\d{2}])+=?" + | |
"([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)*)*" + | |
"(#([-\\w~!$+|.,*:=]|%[a-f\\d]{2})*)?\\b"); | |
Matcher matcher = pattern.matcher(input); | |
while (matcher.find()) { | |
result.add(matcher.group()); | |
} | |
return result; | |
} | |
private LocalPage getLocalPage(int conceptId) throws DaoException { | |
int pageId = conceptDao.getLocalId(language, conceptId); | |
if (pageId < 0) { | |
return null; | |
} | |
return pageDao.getById(language ,pageId); | |
} | |
public static void main(String args[]) throws Exception { | |
Env env = EnvBuilder.envFromArgs(args); | |
CitationAnalyzer analyzer = new CitationAnalyzer(env, Language.EN); | |
analyzer.createCsv(new File("citations.tsv")); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment