Created May 26, 2011 06:37
[Scala][Web] 伊莉文章的Parser
package rainy.maid.server.seeker
import java.util.ArrayList
import rainy.maid.server.domain.EneyPost
import scala.xml.NodeSeq
import scala.xml.{ XML, Node }
import de.hars.scalaxml._
import java.text.SimpleDateFormat
/**
 * Walks a forum thread-listing page by page and turns every "normal thread"
 * row into an [[EneyPost]].
 *
 * @param root prefix prepended to every link extracted from a page
 * @param url  address of the first listing page to fetch
 */
class eneySeeker(root: String, url: String) {

  /** Address that the next call to `next()` will fetch; rewritten on every call. */
  var nextPage: String = url

  /** Matches the `id` attribute of a thread row and captures its numeric id. */
  val normalThreadR = "normalthread_([0-9]{6,7})".r

  /** Format used to parse the last-modified text of a row. */
  val dateFormat = new SimpleDateFormat("yyyy-MM-dd")

  /**
   * Fetches the page at `nextPage` (decoded as big5), extracts one post per
   * `tbody` whose id contains "normalthread", and then points `nextPage` at
   * the link marked as "next" on that page.
   *
   * @return the posts found on the page; each carries id, title, link and
   *         last-modified date
   */
  def next(): ArrayList[EneyPost] = {
    // Read the whole page eagerly so the connection can be released before parsing.
    val source = scala.io.Source.fromURL(nextPage, "big5")
    val html = try source.mkString finally source.close()

    val result = new TagSoupFactoryAdapter loadString (html)
    val subjects = new ArrayList[EneyPost]
    for (tbody <- result \\ "tbody" if (tbody \ "@id").text.contains("normalthread"))
      subjects add (getSubject(tbody))

    // When the page has no "next" link the extracted href is empty, leaving just `root`.
    nextPage = root + getNextPage(result)
    subjects
  }

  /**
   * Builds an [[EneyPost]] from a single thread row.
   *
   * Throws if the row has no `span`, if its id does not match
   * `normalThreadR` in full, or if the last-modified text is not a
   * `yyyy-MM-dd` date.
   *
   * @param tbody the `tbody` element of one thread row
   * @return the populated post
   */
  def getSubject(tbody: Node): EneyPost = {
    val titleEl = (tbody \\ "span").head
    val normalThreadR(id) = (tbody \ "@id").text
    val newPost = new EneyPost
    newPost setId (id.toLong)
    newPost setTitleName (titleEl.text)
    // `.text` yields the raw attribute value; concatenating the NodeSeq itself
    // would go through its XML-escaped string form.
    newPost setURL (root + (titleEl \ "a" \ "@href").text)
    newPost setLastModifyTime (dateFormat.parse(getLastModify(tbody)))
    newPost
  }

  /**
   * Text of the `em` elements below the row's `td` whose class is `author`.
   *
   * @param tbody the `tbody` element of one thread row
   */
  def getLastModify(tbody: Node): String =
    (RichNodeSeq(tbody) \\ "td[@class==author]" \\ "em").text

  /**
   * Href of the link with class `next` inside the `div` with class `pages`.
   *
   * @param body root node of the parsed page
   */
  def getNextPage(body: Node): String =
    (RichNodeSeq(body) \\ "div[@class==pages]" \\ "a[@class==next]" \\ "@href").text
}
package test.rainy.maid.server.maidwork
import org.junit.After
import org.junit.Before
import org.junit.Test
import org.scalatest.junit.JUnitSuite
import scala.collection.mutable.ListBuffer
import rainy.maid.server.domain._
import rainy.maid.server.maidwork._
import scala.collection.JavaConversions._
/**
 * JUnit-style suite that runs the Eney "search new post" job once and prints
 * the posts it stored.
 */
class testEneySearchNewPost extends JUnitSuite {

  // NOTE(review): no import for LocalServiceTestHelper / LocalDatastoreServiceTestConfig
  // is visible in this file — confirm they are brought into scope elsewhere.
  val helper: LocalServiceTestHelper =
    new LocalServiceTestHelper(new LocalDatastoreServiceTestConfig())

  /** Prepares the local service test environment before each test. */
  @Before
  def setUp(): Unit = {
    helper.setUp()
  }

  /** Releases the local service test environment after each test. */
  @After
  def tearDown(): Unit = {
    helper.tearDown()
  }

  /**
   * Runs `EneySearchNewPost` against a work log starting at post id 5506759
   * and prints the id and title of every post returned by
   * `EneyPostManager findPosts(false)`.
   */
  @Test
  def testSearchNewPost(): Unit = {
    val workLog = new EneyWorkLog
    // NOTE(review): both paths are empty in this source; presumably they held
    // the forum URLs — TODO restore before relying on this test.
    workLog setRootPath ""
    workLog setFullPath ""
    workLog setLastPostID 5506759

    val toDoList = new MaidToDoList with EneySearchNewPost
    toDoList doWork (workLog)

    val posts = EneyPostManager findPosts (false)
    posts foreach (post => println((post.getId, post.getTitleName)))
  }
}
