Created
March 7, 2014 11:27
-
-
Save joseporiol/9409883 to your computer and use it in GitHub Desktop.
Extract title from WebPage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.nio.charset.Charset; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
public class TitleExtractor { | |
/* the CASE_INSENSITIVE flag accounts for | |
* sites that use uppercase title tags. | |
* the DOTALL flag accounts for sites that have | |
* line feeds in the title text */ | |
private static final Pattern TITLE_TAG = | |
Pattern.compile("\\<title>(.*)\\</title>", Pattern.CASE_INSENSITIVE|Pattern.DOTALL); | |
/** | |
* @param url the HTML page | |
* @return title text (null if document isn't HTML or lacks a title tag) | |
* @throws IOException | |
*/ | |
public static String getPageTitle(String url) throws IOException { | |
URL u = new URL(url); | |
URLConnection conn = u.openConnection(); | |
// ContentType is an inner class defined below | |
ContentType contentType = getContentTypeHeader(conn); | |
if (!contentType.contentType.equals("text/html")) | |
return null; // don't continue if not HTML | |
else { | |
// determine the charset, or use the default | |
Charset charset = getCharset(contentType); | |
if (charset == null) | |
charset = Charset.defaultCharset(); | |
// read the response body, using BufferedReader for performance | |
InputStream in = conn.getInputStream(); | |
BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset)); | |
int n = 0, totalRead = 0; | |
char[] buf = new char[1024]; | |
StringBuilder content = new StringBuilder(); | |
// read until EOF or first 8192 characters | |
while (totalRead < 8192 && (n = reader.read(buf, 0, buf.length)) != -1) { | |
content.append(buf, 0, n); | |
totalRead += n; | |
} | |
reader.close(); | |
// extract the title | |
Matcher matcher = TITLE_TAG.matcher(content); | |
if (matcher.find()) { | |
/* replace any occurrences of whitespace (which may | |
* include line feeds and other uglies) as well | |
* as HTML brackets with a space */ | |
return matcher.group(1).replaceAll("[\\s\\<>]+", " ").trim(); | |
} | |
else | |
return null; | |
} | |
} | |
/** | |
* Loops through response headers until Content-Type is found. | |
* @param conn | |
* @return ContentType object representing the value of | |
* the Content-Type header | |
*/ | |
private static ContentType getContentTypeHeader(URLConnection conn) { | |
int i = 0; | |
boolean moreHeaders = true; | |
do { | |
String headerName = conn.getHeaderFieldKey(i); | |
String headerValue = conn.getHeaderField(i); | |
if (headerName != null && headerName.equals("Content-Type")) | |
return new ContentType(headerValue); | |
i++; | |
moreHeaders = headerName != null || headerValue != null; | |
} | |
while (moreHeaders); | |
return null; | |
} | |
private static Charset getCharset(ContentType contentType) { | |
if (contentType != null && contentType.charsetName != null && Charset.isSupported(contentType.charsetName)) | |
return Charset.forName(contentType.charsetName); | |
else | |
return null; | |
} | |
/** | |
* Class holds the content type and charset (if present) | |
*/ | |
private static final class ContentType { | |
private static final Pattern CHARSET_HEADER = Pattern.compile("charset=([-_a-zA-Z0-9]+)", Pattern.CASE_INSENSITIVE|Pattern.DOTALL); | |
private String contentType; | |
private String charsetName; | |
private ContentType(String headerValue) { | |
if (headerValue == null) | |
throw new IllegalArgumentException("ContentType must be constructed with a not-null headerValue"); | |
int n = headerValue.indexOf(";"); | |
if (n != -1) { | |
contentType = headerValue.substring(0, n); | |
Matcher matcher = CHARSET_HEADER.matcher(headerValue); | |
if (matcher.find()) | |
charsetName = matcher.group(1); | |
} | |
else | |
contentType = headerValue; | |
} | |
} | |
} |
Works like a charm. Thanks!
This is the same as just using JSoup to do: Jsoup.connect(URL).get().title();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How to run this , where is the main method for it ?