Last active
March 31, 2023 08:27
-
-
Save Daenyth/4742267 to your computer and use it in GitHub Desktop.
Java class to extract an image from an html page using a method similar to Google+'s
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.IOException; | |
import java.net.URL; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
/** | |
* Given a url to a web page, extract a suitable image from that page. This will | |
* attempt to follow a method similar to Google+, as described <a href= | |
* "http://webmasters.stackexchange.com/questions/25581/how-does-google-plus-select-an-image-from-a-shared-link" | |
* >here</a> | |
* | |
*/ | |
public class ImageExtractor { | |
// TODO: Add junit test case for this. (Construct Document from string, extract, check) | |
public static String extractImageUrl(String url) throws IOException { | |
String contentType = new URL(url).openConnection().getContentType(); | |
if (contentType != null) { | |
if (contentType.startsWith("image/")) { | |
return url; | |
} | |
} | |
Document document = Jsoup.connect(url).get(); | |
String imageUrl = null; | |
imageUrl = getImageFromSchema(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
imageUrl = getImageFromOpenGraph(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
imageUrl = getImageFromTwitterCard(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
imageUrl = getImageFromTwitterShared(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
imageUrl = getImageFromLinkRel(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
imageUrl = getImageFromGuess(document); | |
if (imageUrl != null) { | |
return imageUrl; | |
} | |
return imageUrl; | |
} | |
private static String getImageFromTwitterShared(Document document) { | |
Element div = document.select("div.media-gallery-image-wrapper").first(); | |
if (div == null) { | |
return null; | |
} | |
Element img = div.select("img.media-slideshow-image").first(); | |
if (img != null) { | |
return img.absUrl("src"); | |
} | |
return null; | |
} | |
private static String getImageFromGuess(Document document) { | |
// TODO | |
return null; | |
} | |
private static String getImageFromLinkRel(Document document) { | |
Element link = document.select("link[rel=image_src]").first(); | |
if (link != null) { | |
return link.attr("abs:href"); | |
} | |
return null; | |
} | |
private static String getImageFromTwitterCard(Document document) { | |
Element meta = document.select("meta[name=twitter:card][content=photo]").first(); | |
if (meta == null) { | |
return null; | |
} | |
Element image = document.select("meta[name=twitter:image]").first(); | |
return image.attr("abs:content"); | |
} | |
private static String getImageFromOpenGraph(Document document) { | |
Element image = document.select("meta[property=og:image]").first(); | |
if (image != null) { | |
return image.attr("abs:content"); | |
} | |
Element secureImage = document.select("meta[property=og:image:secure]").first(); | |
if (secureImage != null) { | |
return secureImage.attr("abs:content"); | |
} | |
return null; | |
} | |
private static String getImageFromSchema(Document document) { | |
Element container = | |
document.select("*[itemscope][itemtype=http://schema.org/ImageObject]").first(); | |
if (container == null) { | |
return null; | |
} | |
Element image = container.select("img[itemprop=contentUrl]").first(); | |
if (image == null) { | |
return null; | |
} | |
return image.absUrl("src"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks very much .This is solved my problem