Last active
August 1, 2016 21:58
-
-
Save serhii-shnurenko/d696647999466d3f000465940d2869a7 to your computer and use it in GitHub Desktop.
IMDB extractor sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
duration: 2h 10min | |
year: 2016 | |
director: David Ayer | |
name: Suicide Squad | |
---------------- | |
duration: 1h 28min | |
year: 1969 | |
director: Tsun-Shou Sung | |
name: Tie niang zi | |
---------------- | |
duration: 2h | |
year: 1992 | |
director: André Flédérick | |
name: Tiercé gagnant | |
---------------- | |
duration: 1h 27min | |
year: 1995 | |
director: Jonathan Morgan | |
name: Tit Tease 1 | |
---------------- | |
duration: 10min | |
year: 1918 | |
name: Too Many Husbands | |
---------------- | |
year: 2009 | |
name: Top Ten 2 | |
---------------- | |
duration: 1h 58min | |
year: 2012 | |
director: Len Wiseman | |
name: Згадати все | |
---------------- | |
year: 1996 | |
director: Jonathan Morgan | |
name: Totally Depraved 1 | |
---------------- | |
year: 1918 | |
name: The Valley of the Dordogne, France | |
---------------- | |
year: 1918 | |
name: With the Deep Sea Anglers | |
---------------- | |
duration: 2h 15min | |
year: 2008 | |
name: A Woman's Orgasm 2 | |
---------------- | |
year: 1910 | |
director: Fred J. Balshofer | |
name: The Wrong Trail | |
---------------- | |
year: 1918 | |
name: Yesterdays in Samoa | |
---------------- | |
year: 2008 | |
director: Gazzman | |
name: Young Harlots: Finishing School | |
---------------- | |
year: 1978 | |
director: Bruno Gantillon | |
name: Zigzags | |
---------------- | |
year: 1978 | |
director: Jean-Marie Marcel | |
name: À l'ombre d'un soupçon | |
---------------- | |
duration: 26min | |
name: Cerná sanitka | |
---------------- | |
duration: 30min | |
name: Daisuki! Itsutsugo | |
---------------- | |
name: Extra | |
---------------- | |
name: Fantàstic | |
---------------- | |
duration: 30min | |
name: Yonaoshi baraetî: Kangorongo | |
---------------- | |
name: Kun maailma paloi | |
---------------- | |
duration: 30min | |
name: Matsu Shin | |
---------------- | |
duration: 30min | |
name: Matsumoto Shinsuke | |
---------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.Map; | |
/** | |
* Created by Сергій Шнуренко on 01.08.2016. | |
*/ | |
public class Playground { | |
private static int startID = 1386697; | |
private static int finishID = 1386720; | |
public static void main(String[] args) throws IOException { | |
for(int i = startID; i<=finishID;i++){ | |
printMovieInfo(i); | |
System.out.println("----------------"); | |
} | |
} | |
public static void printMovieInfo(int id) throws IOException{ | |
HashMap<String,String> movieData = new HashMap<String, String>(); | |
Document doc = Jsoup.connect(String.format("http://www.imdb.com/title/tt%07d/",id)).get(); | |
Element name = doc.select("div.title_wrapper").select("h1").first(); | |
Element year = name.select("a").first(); | |
Element duration = doc.select("time").first(); | |
Element director = doc.select("span[itemprop=\"director\"]>a>span[itemprop=\"name\"]").first(); | |
if(name!=null) | |
movieData.put("name",name.ownText()); | |
if(year!=null) | |
movieData.put("year",year.ownText()); | |
if(duration!=null) | |
movieData.put("duration",duration.ownText()); | |
if(director!=null) | |
movieData.put("director", director.ownText()); | |
printMap(movieData); | |
} | |
private static void printMap(HashMap<String, String> hMap){ | |
for(Map.Entry<String, String> mapEntry : hMap.entrySet()){ | |
System.out.println(mapEntry.getKey()+": "+mapEntry.getValue()); | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the IMDb extractor sample; jsoup is its only dependency. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>home</groupId>
    <artifactId>imdb_scrubber</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- Pin the source encoding so non-ASCII text in the sources is read the same
             way on every platform instead of with the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- HTML fetching and CSS-selector based parsing. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.8.3</version>
        </dependency>
    </dependencies>
</project>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment