Created
November 24, 2017 07:41
-
-
Save chenyahui/2f80a332e087b4258eff420857423ceb to your computer and use it in GitHub Desktop.
页面日期时间抽取
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.cyhone; | |
import java.text.SimpleDateFormat; | |
import java.util.Calendar; | |
import java.util.Date; | |
import java.util.concurrent.TimeUnit; | |
/**
 * Helpers for building, formatting and comparing {@link Date} values at day granularity.
 *
 * @author cyhone
 * @date 2017/3/28
 */
public class DateUtils {
    /** Pattern used by {@link #toString(Date)}. */
    private static final String DATE_PATTERN = "yyyy-MM-dd";

    /**
     * Builds a {@link Date} from calendar fields, validating them strictly.
     *
     * <p>The returned date has its time-of-day fields cleared (midnight in the default time
     * zone), so two calls with the same arguments yield equal dates.
     *
     * @param year  four-digit year, e.g. 2017
     * @param month natural month number: 1 is January, 2 is February, ...
     * @param day   day of month, e.g. 1 or 31
     * @return the date, or {@code null} if any argument is {@code null} or the fields do not
     *         form a valid calendar date (e.g. February 30th)
     */
    public static Date formatDate(Integer year, Integer month, Integer day) {
        if (year == null || month == null || day == null) {
            return null;
        }
        Calendar cal = Calendar.getInstance();
        // getInstance() is initialised to "now"; clear it so no wall-clock time leaks into the
        // result and day arithmetic on the returned dates is exact.
        cal.clear();
        // Non-lenient mode makes getTime() reject out-of-range fields instead of rolling over.
        cal.setLenient(false);
        cal.set(year, month - 1, day);
        try {
            return cal.getTime();
        } catch (IllegalArgumentException e) {
            // Thrown by the non-lenient calendar when the fields are not a real date.
            return null;
        }
    }

    /**
     * String variant of {@link #formatDate(Integer, Integer, Integer)}.
     *
     * @param year  year as decimal digits
     * @param month natural month number as decimal digits
     * @param day   day of month as decimal digits
     * @return the date, or {@code null} if an argument is {@code null}, is not a number, or
     *         the fields do not form a valid calendar date
     */
    public static Date formatDate(String year, String month, String day) {
        try {
            return formatDate(Integer.valueOf(year), Integer.valueOf(month), Integer.valueOf(day));
        } catch (NumberFormatException e) {
            // Integer.valueOf rejects null as well as non-numeric text with this exception.
            return null;
        }
    }

    /**
     * Formats a date as {@code yyyy-MM-dd}.
     *
     * @param date the date to format; must not be {@code null}
     * @return the formatted date
     */
    public static String toString(Date date) {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        SimpleDateFormat formatter = new SimpleDateFormat(DATE_PATTERN);
        return formatter.format(date);
    }

    /**
     * Returns {@code date1 - date2} in whole days; any partial day is truncated toward zero.
     *
     * @param date1 minuend
     * @param date2 subtrahend
     * @return number of whole days between the two instants (negative if date1 is earlier)
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public static int minusDate(Date date1, Date date2) {
        if (date1 == null || date2 == null) {
            throw new IllegalArgumentException("date1 or date2 cannot be null");
        }
        long diffMillis = date1.getTime() - date2.getTime();
        return (int) TimeUnit.DAYS.convert(diffMillis, TimeUnit.MILLISECONDS);
    }

    /**
     * Returns the current year in the default time zone.
     *
     * @return the current year as a decimal string, e.g. {@code "2017"}
     */
    public static String thisYear() {
        return String.valueOf(Calendar.getInstance().get(Calendar.YEAR));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.cyhone; | |
import java.util.ArrayList; | |
import java.util.Date; | |
import java.util.List; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/** | |
* @author cyhone | |
* @date 2017/3/28 | |
*/ | |
public class PublishedDate { | |
private PublishedDate() { | |
} | |
public static Date parse(String url, String html) { | |
Date urlDate = parseFromUrl(url); | |
Date contentDate = parseFromHTML(html); | |
return chooseLastestDate(urlDate, contentDate); | |
} | |
private static List<Pattern> urlPatterns = new ArrayList<Pattern>(); | |
private static List<Pattern> contentPatterns = new ArrayList<Pattern>(); | |
private static final String YEAR_REGEX = "(?<year>20[01]\\d{1})"; //仅抽取200x或者201x年的日期 | |
private static final String MONTH_REGEX = "(?<month>(?:1[012])|(?:0?[1-9]))"; | |
private static final String DAY_REGEX = "(?<day>(?:[12][0-9])|(?:3[01])|(?:0?[1-9]))"; | |
static { | |
initUrlRegex(); | |
initContentRegex(); | |
} | |
/** | |
* 装配url正则表达式 | |
*/ | |
private static void initUrlRegex() { | |
/* | |
1. /201612/08/ | |
2. /201703/ | |
3. /20170321/ | |
4. /2017-2-28/ | |
5. /2016/1114/ | |
6. /2016-12-31-600479.html | |
*/ | |
String urlReg1 = "/" + YEAR_REGEX + "[/\\-]?" + MONTH_REGEX + "(?:[/\\-]?" + DAY_REGEX + ")?[/\\-]"; | |
/* | |
7. /3/6/2005/ | |
*/ | |
String urlReg2 = "/" + MONTH_REGEX + "[/\\-]?" + DAY_REGEX + "[/\\-]?" + YEAR_REGEX + "[/\\-]"; | |
urlPatterns.add(Pattern.compile(urlReg1)); | |
urlPatterns.add(Pattern.compile(urlReg2)); | |
} | |
public static Date parseFromUrl(String url) { | |
Date result = null; | |
for (Pattern pattern : urlPatterns) { | |
Matcher matcher = pattern.matcher(url); | |
String year = null, month = null, day = null; | |
if (matcher.find()) { | |
year = matcher.group("year"); | |
month = matcher.group("month"); | |
day = matcher.group("day"); | |
} | |
result = formatDate(year, month, day); | |
if (result != null) return result; | |
} | |
return null; | |
} | |
private static void initContentRegex() { | |
/* | |
1. 2016年11月23日 | |
2. 2016-12-20 | |
3. 2016/12/20 | |
4. 2016\12\20 | |
*/ | |
String contentReg1 = YEAR_REGEX + "\\s*[年/\\-_::\\\\]\\s*" | |
+ MONTH_REGEX + "\\s*[月/\\-_::\\\\]\\s*" | |
+ DAY_REGEX + "\\s*[日/\\-_::\\\\]?\\s*"; | |
/* | |
5. 10-16 08:23 | |
*/ | |
String contentReg2 = "(" + YEAR_REGEX + "\\s*[年/\\-_::\\\\]\\s*)?" | |
+ MONTH_REGEX + "\\s*[月/\\-_::\\\\]\\s*" | |
+ DAY_REGEX + "\\s*[日/\\-_::\\\\]?\\s*" | |
+ "[0-2]\\d[::][0-6]\\d"; | |
contentPatterns.add(Pattern.compile(contentReg1)); | |
contentPatterns.add(Pattern.compile(contentReg2)); | |
} | |
public static Date parseFromText(String content) { | |
Date result = null; | |
for (Pattern pattern : contentPatterns) { | |
Matcher matcher = pattern.matcher(content); | |
String year = null, month = null, day = null; | |
while (matcher.find()) { | |
month = matcher.group("month"); | |
day = matcher.group("day"); | |
year = matcher.group("year"); | |
year = year == null ? DateUtils.thisYear() : year; | |
result = chooseLastestDate(result, formatDate(year, month, day)); | |
} | |
} | |
return result; | |
} | |
/** | |
* 选择最晚的日期 | |
* | |
* @param date1 date | |
* @param date2 date | |
* @return | |
*/ | |
private static Date chooseLastestDate(Date date1, Date date2) { | |
if (date1 == null) | |
return date2; | |
if (date2 == null) | |
return date1; | |
return DateUtils.minusDate(date1, date2) > 0 ? date1 : date2; | |
} | |
public static Date parseFromHTML(String html) { | |
// 预处理,去除标签、script,仅留取body里面的数据 | |
html = html.replaceAll("(?is)<script.*?>.*?</script>", "") | |
.replaceAll("(?is)<head.*?>.*?</head>", ""); | |
return parseFromText(html); | |
} | |
private static Date formatDate(String year, String month, String day) { | |
if (day == null) | |
day = "01"; | |
if (year == null || month == null) | |
return null; | |
return DateUtils.formatDate(year, month, day); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment