Skip to content

Instantly share code, notes, and snippets.

@Daenyth
Last active March 31, 2023 08:27
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save Daenyth/4742267 to your computer and use it in GitHub Desktop.
Save Daenyth/4742267 to your computer and use it in GitHub Desktop.
Java class to extract an image from an html page using a method similar to Google+'s
import java.io.IOException;
import java.net.URL;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
/**
* Given a url to a web page, extract a suitable image from that page. This will
* attempt to follow a method similar to Google+, as described <a href=
* "http://webmasters.stackexchange.com/questions/25581/how-does-google-plus-select-an-image-from-a-shared-link"
* >here</a>
*
*/
public class ImageExtractor {
// TODO: Add junit test case for this. (Construct Document from string, extract, check)
public static String extractImageUrl(String url) throws IOException {
String contentType = new URL(url).openConnection().getContentType();
if (contentType != null) {
if (contentType.startsWith("image/")) {
return url;
}
}
Document document = Jsoup.connect(url).get();
String imageUrl = null;
imageUrl = getImageFromSchema(document);
if (imageUrl != null) {
return imageUrl;
}
imageUrl = getImageFromOpenGraph(document);
if (imageUrl != null) {
return imageUrl;
}
imageUrl = getImageFromTwitterCard(document);
if (imageUrl != null) {
return imageUrl;
}
imageUrl = getImageFromTwitterShared(document);
if (imageUrl != null) {
return imageUrl;
}
imageUrl = getImageFromLinkRel(document);
if (imageUrl != null) {
return imageUrl;
}
imageUrl = getImageFromGuess(document);
if (imageUrl != null) {
return imageUrl;
}
return imageUrl;
}
private static String getImageFromTwitterShared(Document document) {
Element div = document.select("div.media-gallery-image-wrapper").first();
if (div == null) {
return null;
}
Element img = div.select("img.media-slideshow-image").first();
if (img != null) {
return img.absUrl("src");
}
return null;
}
private static String getImageFromGuess(Document document) {
// TODO
return null;
}
private static String getImageFromLinkRel(Document document) {
Element link = document.select("link[rel=image_src]").first();
if (link != null) {
return link.attr("abs:href");
}
return null;
}
private static String getImageFromTwitterCard(Document document) {
Element meta = document.select("meta[name=twitter:card][content=photo]").first();
if (meta == null) {
return null;
}
Element image = document.select("meta[name=twitter:image]").first();
return image.attr("abs:content");
}
private static String getImageFromOpenGraph(Document document) {
Element image = document.select("meta[property=og:image]").first();
if (image != null) {
return image.attr("abs:content");
}
Element secureImage = document.select("meta[property=og:image:secure]").first();
if (secureImage != null) {
return secureImage.attr("abs:content");
}
return null;
}
private static String getImageFromSchema(Document document) {
Element container =
document.select("*[itemscope][itemtype=http://schema.org/ImageObject]").first();
if (container == null) {
return null;
}
Element image = container.select("img[itemprop=contentUrl]").first();
if (image == null) {
return null;
}
return image.absUrl("src");
}
}
@feliperazeek
Copy link

this is great thank you for sharing!

@husnakablan
Copy link

Thanks very much .This is solved my problem

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment