Skip to content

Instantly share code, notes, and snippets.

@jrodbx
Last active August 29, 2015 14:00
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save jrodbx/11202614 to your computer and use it in GitHub Desktop.
Google I/O 2014 YouTube video annotation easter egg crawler
import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.client.http.HttpRequest;
import com.google.api.client.http.HttpRequestInitializer;
import com.google.api.client.http.HttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.client.util.DateTime;
import com.google.api.services.youtube.YouTube;
import com.google.api.services.youtube.model.PageInfo;
import com.google.api.services.youtube.model.ResourceId;
import com.google.api.services.youtube.model.SearchListResponse;
import com.google.api.services.youtube.model.SearchResult;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
public class YouTubeCrawler {
public static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();
public static final JsonFactory JSON_FACTORY = new JacksonFactory();
private static final String PROPERTIES_FILENAME = "youtube.properties";
private static final long NUMBER_OF_VIDEOS_RETURNED = 50;
private static YouTube youtube;
private static String ANNOTATION_URI = "https://www.youtube.com/annotations_invideo?features=1&legacy=0&video_id=";
private static int counter = 0;
public static void main(String[] args) {
// YouTube api requests limit to 50 results per page, 500 results total
// therefore, if a query results in more than 500 results, we'll need to
// run multiple queries updating the publishedBefore search parameter each time
DateTime lastPublished = new DateTime("2014-04-18T20:53:24.000Z");
//DateTime lastPublished = new DateTime("2013-09-19T17:00:15.000Z");
//DateTime lastPublished = new DateTime("2013-05-18T19:00:27.000Z");
//DateTime lastPublished = new DateTime("2013-01-29T22:18:50.000Z");
//DateTime lastPublished = new DateTime("2012-06-21T21:32:19.000Z");
//DateTime lastPublished = new DateTime("2009-08-11T22:46:53.000Z");
//DateTime lastPublished = new DateTime("2007-12-21T13:18:39.000Z");
//DateTime lastPublished = new DateTime("2007-08-23T17:44:51.000Z");
Properties properties = new Properties();
try {
InputStream in = YouTube.Search.class.getResourceAsStream("/" + PROPERTIES_FILENAME);
properties.load(in);
} catch (IOException e) {
System.err.println("There was an error reading " + PROPERTIES_FILENAME + ": " + e.getCause()
+ " : " + e.getMessage());
System.exit(1);
}
String apiKey = properties.getProperty("youtube.apikey");
String appName = properties.getProperty("app.name");
try {
youtube = new YouTube.Builder(HTTP_TRANSPORT, JSON_FACTORY, new HttpRequestInitializer() {
public void initialize(HttpRequest request) throws IOException {
return;
}
}).setApplicationName(appName).build();
String prevPageToken = null;
String nextPageToken = null;
while (true) {
YouTube.Search.List search = youtube.search().list("id,snippet");
// Set your developer key from the Google Developers Console for non-authenticated requests.
// See: https://cloud.google.com/console
search.setKey(apiKey);
search.setOrder("date");
search.setChannelId("UC_x5XG1OV2P6uZZ5FSM9Ttw");
search.setPageToken(nextPageToken);
search.setPublishedBefore(lastPublished);
search.setType("video");
//search.setFields("items(id/kind,id/videoId,snippet/title),nextPageToken,pageInfo(totalResults)");
search.setFields("items(id/kind,id/videoId,snippet/publishedAt),nextPageToken,pageInfo(totalResults)");
search.setMaxResults(NUMBER_OF_VIDEOS_RETURNED);
SearchListResponse searchResponse = search.execute();
PageInfo pageInfo = searchResponse.getPageInfo();
if (pageInfo != null && nextPageToken == null) {
int totalResults = pageInfo.getTotalResults();
System.out.println("\n=============================================================");
System.out.println(" " + totalResults + " videos for search on \"Google Developers\".");
System.out.println("=============================================================\n");
}
prevPageToken = nextPageToken;
nextPageToken = searchResponse.getNextPageToken();
//System.out.println("next page: " + nextPageToken);
List<SearchResult> searchResultList = searchResponse.getItems();
if (searchResultList != null) {
prettyPrint(searchResultList.iterator());
}
if (nextPageToken == null) {
DateTime finalPub = null;
if(searchResultList != null) {
finalPub = searchResultList.get(searchResultList.size()-1).getSnippet().getPublishedAt();
}
System.out.println("no more next page, pubAt = " + finalPub + ", counter = " + counter + ", prev = " + prevPageToken + ", next = " + nextPageToken);
break;
}
}
} catch (GoogleJsonResponseException e) {
System.err.println("There was a service error: " + e.getDetails().getCode() + " : " + e.getDetails().getMessage());
} catch (IOException e) {
System.err.println("There was an IO error: " + e.getCause() + " : " + e.getMessage());
} catch (Throwable t) {
t.printStackTrace();
}
}
private static void prettyPrint(Iterator<SearchResult> iteratorSearchResults) {
while (iteratorSearchResults.hasNext()) {
SearchResult singleVideo = iteratorSearchResults.next();
ResourceId rId = singleVideo.getId();
counter++;
if (rId.getKind().equals("youtube#video")) {
//System.out.println(" Title: " + singleVideo.getSnippet().getTitle());
printAnnotation(rId.getVideoId(), singleVideo.getSnippet().getPublishedAt());
//System.out.println(" Video Id: " + rId.getVideoId());
//System.out.println();
}
}
}
private static void printAnnotation(String videoId, DateTime publishedAt) {
try {
DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
Document doc = dBuilder.parse(new URL(ANNOTATION_URI + videoId).openStream());
doc.getDocumentElement().normalize();
NodeList annotationsList = doc.getElementsByTagName("TEXT");
for (int i = 0; i < annotationsList.getLength(); i++) {
Node annotationNode = annotationsList.item(i);
String annotation = annotationNode.getTextContent();
if (annotation.contains("goo.gl")) {
System.out.println(annotation + " , " + publishedAt + " , " + counter);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment