Skip to content

Instantly share code, notes, and snippets.

@chenyahui
Created November 24, 2017 07:41
Show Gist options
  • Save chenyahui/2f80a332e087b4258eff420857423ceb to your computer and use it in GitHub Desktop.
Save chenyahui/2f80a332e087b4258eff420857423ceb to your computer and use it in GitHub Desktop.
页面日期时间抽取
package com.cyhone;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.concurrent.TimeUnit;
/**
 * Small static helpers for building, formatting and comparing calendar dates.
 *
 * <p>Dates are built with {@link Calendar#getInstance()}, i.e. in the JVM's default
 * time zone and locale.
 *
 * @author cyhone
 * @date 2017/3/28
 */
public class DateUtils {
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Builds a date from its components, validating them strictly (non-lenient).
     *
     * <p>The returned date has all time-of-day fields cleared, so two calls with the
     * same arguments yield equal {@link Date} objects.
     *
     * @param year  the year, e.g. 2017
     * @param month the calendar month, 1-based (1 = January, 2 = February, ...)
     * @param day   the day of month, e.g. 1 or 31
     * @return the corresponding date, or {@code null} if any component is {@code null}
     *         or the components do not form a valid date
     */
    public static Date formatDate(Integer year, Integer month, Integer day) {
        if (year == null || month == null || day == null) {
            return null;
        }
        Calendar cal = Calendar.getInstance();
        // getInstance() is initialised with the current time; without clear() the
        // hour/minute/second/millisecond of "now" would leak into the result.
        cal.clear();
        cal.setLenient(false);
        cal.set(year, month - 1, day); // Calendar months are 0-based
        try {
            // Field validation is deferred until the time is computed.
            return cal.getTime();
        } catch (IllegalArgumentException e) {
            return null;
        }
    }

    /**
     * String overload of {@link #formatDate(Integer, Integer, Integer)}.
     *
     * @param year  the year as decimal digits
     * @param month the 1-based month as decimal digits
     * @param day   the day of month as decimal digits
     * @return the corresponding date, or {@code null} if the components do not form a valid date
     * @throws NumberFormatException if any argument is {@code null} or not a parsable integer
     */
    public static Date formatDate(String year, String month, String day) {
        return formatDate(Integer.valueOf(year), Integer.valueOf(month), Integer.valueOf(day));
    }

    /**
     * Formats a date as {@code yyyy-MM-dd}.
     *
     * @param date the date to format
     * @return the formatted date string
     */
    public static String toString(Date date) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
        SimpleDateFormat formatter = new SimpleDateFormat(DATE_PATTERN);
        return formatter.format(date);
    }

    /**
     * Returns {@code date1 - date2} in whole days.
     *
     * <p>The millisecond difference is converted with {@link TimeUnit#convert}, which
     * truncates: a difference of less than 24 hours yields 0.
     *
     * @param date1 the minuend
     * @param date2 the subtrahend
     * @return the number of whole days between the two dates (negative if date1 is earlier)
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public static int minusDate(Date date1, Date date2) {
        if (date1 == null || date2 == null) {
            throw new IllegalArgumentException("date1 or date2 cannot be null");
        }
        return (int) TimeUnit.DAYS.convert(date1.getTime() - date2.getTime(), TimeUnit.MILLISECONDS);
    }

    /**
     * Returns the current year as a string.
     *
     * @return the current year according to the default calendar, e.g. "2017"
     */
    public static String thisYear() {
        return String.valueOf(Calendar.getInstance().get(Calendar.YEAR));
    }
}
package com.cyhone;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts a page's publication date from its URL and/or its HTML content using
 * regular expressions.
 *
 * <p>Date construction is delegated to {@code DateUtils}.
 *
 * @author cyhone
 * @date 2017/3/28
 */
public class PublishedDate {
    private PublishedDate() {
    }

    /**
     * Extracts a date from both the URL and the HTML and returns the later of the two.
     *
     * @param url  the page URL; {@code null} is treated as containing no date
     * @param html the page HTML; {@code null} is treated as containing no date
     * @return the latest date found, or {@code null} if neither source yields a date
     */
    public static Date parse(String url, String html) {
        Date urlDate = parseFromUrl(url);
        Date contentDate = parseFromHTML(html);
        return chooseLatestDate(urlDate, contentDate);
    }

    private static final List<Pattern> urlPatterns = new ArrayList<Pattern>();
    private static final List<Pattern> contentPatterns = new ArrayList<Pattern>();

    // Only years 200x and 201x are extracted.
    // NOTE(review): dates from 2020 onward are therefore never matched - confirm this is still intended.
    private static final String YEAR_REGEX = "(?<year>20[01]\\d{1})";
    private static final String MONTH_REGEX = "(?<month>(?:1[012])|(?:0?[1-9]))";
    private static final String DAY_REGEX = "(?<day>(?:[12][0-9])|(?:3[01])|(?:0?[1-9]))";

    static {
        initUrlRegex();
        initContentRegex();
    }

    /**
     * Compiles the regular expressions used to find dates in URLs.
     */
    private static void initUrlRegex() {
        /*
        Year first, day optional:
        1. /201612/08/
        2. /201703/
        3. /20170321/
        4. /2017-2-28/
        5. /2016/1114/
        6. /2016-12-31-600479.html
        */
        String urlReg1 = "/" + YEAR_REGEX + "[/\\-]?" + MONTH_REGEX + "(?:[/\\-]?" + DAY_REGEX + ")?[/\\-]";
        /*
        Month/day/year:
        7. /3/6/2005/
        */
        String urlReg2 = "/" + MONTH_REGEX + "[/\\-]?" + DAY_REGEX + "[/\\-]?" + YEAR_REGEX + "[/\\-]";
        urlPatterns.add(Pattern.compile(urlReg1));
        urlPatterns.add(Pattern.compile(urlReg2));
    }

    /**
     * Extracts a date from a URL.
     *
     * <p>The URL patterns are tried in order; for each one only its first match is
     * considered, and the first match that forms a valid date is returned. A match
     * without a day component is treated as the first day of the month.
     *
     * @param url the URL to inspect; {@code null} is treated as containing no date
     * @return the extracted date, or {@code null} if none was found
     */
    public static Date parseFromUrl(String url) {
        if (url == null) {
            return null;
        }
        for (Pattern pattern : urlPatterns) {
            Matcher matcher = pattern.matcher(url);
            if (!matcher.find()) {
                continue;
            }
            Date result = formatDate(matcher.group("year"), matcher.group("month"), matcher.group("day"));
            if (result != null) {
                return result;
            }
        }
        return null;
    }

    /**
     * Compiles the regular expressions used to find dates in page text.
     */
    private static void initContentRegex() {
        /*
        Full dates:
        1. 2016年11月23日
        2. 2016-12-20
        3. 2016/12/20
        4. 2016\12\20
        */
        String contentReg1 = YEAR_REGEX + "\\s*[年/\\-_::\\\\]\\s*"
                + MONTH_REGEX + "\\s*[月/\\-_::\\\\]\\s*"
                + DAY_REGEX + "\\s*[日/\\-_::\\\\]?\\s*";
        /*
        Month and day followed by a time, year optional:
        5. 10-16 08:23
        */
        String contentReg2 = "(" + YEAR_REGEX + "\\s*[年/\\-_::\\\\]\\s*)?"
                + MONTH_REGEX + "\\s*[月/\\-_::\\\\]\\s*"
                + DAY_REGEX + "\\s*[日/\\-_::\\\\]?\\s*"
                + "[0-2]\\d[::][0-6]\\d";
        contentPatterns.add(Pattern.compile(contentReg1));
        contentPatterns.add(Pattern.compile(contentReg2));
    }

    /**
     * Extracts the latest date mentioned in a piece of text.
     *
     * <p>Every match of every content pattern is considered. A match without a year
     * is assumed to be in the current year.
     *
     * @param content the text to scan; {@code null} is treated as containing no date
     * @return the latest valid date found, or {@code null} if there is none
     */
    public static Date parseFromText(String content) {
        if (content == null) {
            return null;
        }
        Date result = null;
        for (Pattern pattern : contentPatterns) {
            Matcher matcher = pattern.matcher(content);
            while (matcher.find()) {
                String year = matcher.group("year");
                if (year == null) {
                    year = DateUtils.thisYear();
                }
                Date candidate = formatDate(year, matcher.group("month"), matcher.group("day"));
                result = chooseLatestDate(result, candidate);
            }
        }
        return result;
    }

    /**
     * Picks the later of two dates.
     *
     * @param date1 first date, may be {@code null}
     * @param date2 second date, may be {@code null}
     * @return the later date; the non-null one if only one is given; {@code null} if both are {@code null}
     */
    private static Date chooseLatestDate(Date date1, Date date2) {
        if (date1 == null) {
            return date2;
        }
        if (date2 == null) {
            return date1;
        }
        // Compare instants directly. DateUtils.minusDate truncates to whole days, so a
        // difference of slightly under 24h would be reported as 0 and the earlier date
        // would be chosen.
        return date1.after(date2) ? date1 : date2;
    }

    /**
     * Extracts the latest date mentioned in an HTML document.
     *
     * <p>The {@code <script>} elements and the {@code <head>} section are removed
     * before scanning; all other markup is left in place and scanned as text.
     *
     * @param html the HTML to scan; {@code null} is treated as containing no date
     * @return the latest valid date found, or {@code null} if there is none
     */
    public static Date parseFromHTML(String html) {
        if (html == null) {
            return null;
        }
        String stripped = html.replaceAll("(?is)<script.*?>.*?</script>", "")
                .replaceAll("(?is)<head.*?>.*?</head>", "");
        return parseFromText(stripped);
    }

    /**
     * Builds a date from regex capture groups.
     *
     * @param year  captured year, may be {@code null}
     * @param month captured month, may be {@code null}
     * @param day   captured day; {@code null} defaults to the first of the month
     * @return the date, or {@code null} if year or month is missing or the date is invalid
     */
    private static Date formatDate(String year, String month, String day) {
        if (year == null || month == null) {
            return null;
        }
        return DateUtils.formatDate(year, month, day == null ? "01" : day);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment