Skip to content

Instantly share code, notes, and snippets.

@serhii-shnurenko
Last active August 1, 2016 21:58
Show Gist options
  • Save serhii-shnurenko/d696647999466d3f000465940d2869a7 to your computer and use it in GitHub Desktop.
Save serhii-shnurenko/d696647999466d3f000465940d2869a7 to your computer and use it in GitHub Desktop.
IMDB extractor sample
duration: 2h 10min
year: 2016
director: David Ayer
name: Suicide Squad 
----------------
duration: 1h 28min
year: 1969
director: Tsun-Shou Sung
name: Tie niang zi 
----------------
duration: 2h
year: 1992
director: André Flédérick
name: Tiercé gagnant 
----------------
duration: 1h 27min
year: 1995
director: Jonathan Morgan
name: Tit Tease 1 
----------------
duration: 10min
year: 1918
name: Too Many Husbands 
----------------
year: 2009
name: Top Ten 2 
----------------
duration: 1h 58min
year: 2012
director: Len Wiseman
name: Згадати все 
----------------
year: 1996
director: Jonathan Morgan
name: Totally Depraved 1 
----------------
year: 1918
name: The Valley of the Dordogne, France 
----------------
year: 1918
name: With the Deep Sea Anglers 
----------------
duration: 2h 15min
year: 2008
name: A Woman's Orgasm 2 
----------------
year: 1910
director: Fred J. Balshofer
name: The Wrong Trail 
----------------
year: 1918
name: Yesterdays in Samoa 
----------------
year: 2008
director: Gazzman
name: Young Harlots: Finishing School 
----------------
year: 1978
director: Bruno Gantillon
name: Zigzags 
----------------
year: 1978
director: Jean-Marie Marcel
name: À l'ombre d'un soupçon 
----------------
duration: 26min
name: Cerná sanitka 
----------------
duration: 30min
name: Daisuki! Itsutsugo 
----------------
name: Extra 
----------------
name: Fantàstic 
----------------
duration: 30min
name: Yonaoshi baraetî: Kangorongo 
----------------
name: Kun maailma paloi 
----------------
duration: 30min
name: Matsu Shin 
----------------
duration: 30min
name: Matsumoto Shinsuke 
----------------
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Sample IMDb extractor: walks a fixed range of numeric title IDs, downloads
 * each title page with jsoup and prints whichever of the fields
 * name / year / duration / director could be found on it.
 *
 * Created by Serhii Shnurenko on 01.08.2016.
 */
public class Playground {
    /** First numeric title ID to fetch (inclusive). */
    private static final int startID = 1386697;
    /** Last numeric title ID to fetch (inclusive). */
    private static final int finishID = 1386720;

    /**
     * Prints the extracted data for every ID in the configured range,
     * writing a separator line after each title.
     *
     * @param args ignored
     * @throws IOException if fetching any of the title pages fails
     */
    public static void main(String[] args) throws IOException {
        for (int i = startID; i <= finishID; i++) {
            printMovieInfo(i);
            System.out.println("----------------");
        }
    }

    /**
     * Downloads the page for one title and prints the fields present on it.
     * Fields whose element is missing from the page are simply omitted.
     *
     * @param id numeric part of the title ID; it is zero-padded to seven digits in the URL
     * @throws IOException if the page cannot be fetched
     */
    public static void printMovieInfo(int id) throws IOException {
        HashMap<String, String> movieData = new HashMap<String, String>();
        Document doc = Jsoup.connect(String.format("http://www.imdb.com/title/tt%07d/", id)).get();

        Element name = doc.select("div.title_wrapper").select("h1").first();
        // The year link is searched for inside the title heading, so it can only be
        // looked up when the heading exists; dereferencing a missing heading here
        // would throw a NullPointerException before the null checks below run.
        Element year = (name != null) ? name.select("a").first() : null;
        Element duration = doc.select("time").first();
        Element director = doc.select("span[itemprop=\"director\"]>a>span[itemprop=\"name\"]").first();

        if (name != null) {
            movieData.put("name", name.ownText());
        }
        if (year != null) {
            movieData.put("year", year.ownText());
        }
        if (duration != null) {
            movieData.put("duration", duration.ownText());
        }
        if (director != null) {
            movieData.put("director", director.ownText());
        }
        printMap(movieData);
    }

    /**
     * Prints every entry of the map as {@code key: value}, one per line,
     * in the map's own iteration order.
     *
     * @param hMap the entries to print
     */
    private static void printMap(HashMap<String, String> hMap) {
        for (Map.Entry<String, String> mapEntry : hMap.entrySet()) {
            System.out.println(mapEntry.getKey() + ": " + mapEntry.getValue());
        }
    }
}
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the IMDB extractor sample. -->
<modelVersion>4.0.0</modelVersion>
<groupId>home</groupId>
<artifactId>imdb_scrubber</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- jsoup is the only declared dependency; it provides the org.jsoup classes the Playground class imports. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.8.3</version>
</dependency>
</dependencies>
</project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment