Skip to content

Instantly share code, notes, and snippets.

@Carolusian
Created July 31, 2012 13:43
Show Gist options
  • Save Carolusian/3217139 to your computer and use it in GitHub Desktop.
Save Carolusian/3217139 to your computer and use it in GitHub Desktop.
Export wordbook from remword to youdao dict
import org.eclipse.swt.SWT
import org.eclipse.swt.widgets.{Shell, Display, ToolItem, ToolBar, Listener, Event}
import org.eclipse.swt.browser.Browser
import org.eclipse.swt.layout._
import scala.util.matching.Regex
import scala.xml._
import org.w3c.tidy.Tidy
import java.io._
/**
 * Small SWT application that shows the remword.cn site in an embedded browser and
 * appends the words found on a wordbook review page to the file `wordbook.xml`,
 * one `<item><word>...</word></item>` entry per word.
 */
object Remword2YouDao {
  /** Caption of the toolbar button that extracts the words of the page being shown. */
  val btnText = "Click Here to Export to Youdao Format.."

  /** Caption of the toolbar button that writes the contents of [[list]]. */
  val btnExport = "Export"

  /** Matches the URL of a wordbook review page; the group captures the page id. */
  val wordbookUrl = """^http://online.remword.cn/review/([0-9a-zA-Z]+)/$""".r

  /**
   * Buffer handed to the "Export" button. Nothing in this object adds to it, so that
   * button has nothing to write; words are written page by page by the other button.
   */
  val list = new scala.collection.mutable.ListBuffer[String]()

  /**
   * Builds the window (toolbar with two buttons above a browser), wires the button
   * listeners, opens the remword start page and runs the SWT event loop until the
   * shell is closed.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val grid = new GridLayout
    grid.numColumns = 1
    val display = new Display
    val shell = new Shell(display)
    shell.setLayout(grid)
    shell.setText("Export remword's wordbook to youdao format...")
    shell.setMaximized(true)

    val toolbar = new ToolBar(shell, SWT.NONE)
    toolbar.setBounds(5, 5, 350, 30)
    val item = new ToolItem(toolbar, SWT.PUSH)
    val exportItem = new ToolItem(toolbar, SWT.PUSH)
    item.setText(btnText)
    exportItem.setText(btnExport)

    // Let the browser take all the space the toolbar does not use.
    val browserGridData = new GridData
    browserGridData.verticalAlignment = GridData.FILL
    browserGridData.horizontalAlignment = GridData.FILL
    browserGridData.grabExcessHorizontalSpace = true
    browserGridData.grabExcessVerticalSpace = true
    val browser = new Browser(shell, SWT.NONE)
    browser.setLayoutData(browserGridData)
    shell.open()

    // Extract only when the browser is showing a wordbook review page.
    val listener = new Listener() {
      def handleEvent(event: Event): Unit = {
        browser.getUrl match {
          case url @ wordbookUrl(_) => extract(url, browser)
          case _ => ()
        }
      }
    }
    val listener2 = new Listener() {
      def handleEvent(event: Event): Unit = {
        exportWords(list.toList)
      }
    }
    item.addListener(SWT.Selection, listener)
    exportItem.addListener(SWT.Selection, listener2)
    browser.setUrl("http://www.remword.cn/index.php")

    while (!shell.isDisposed) {
      if (!display.readAndDispatch()) display.sleep()
    }
    display.dispose()
  }

  /**
   * Extracts the words of the page currently shown in `browser`, prints them, appends
   * them to `wordbook.xml` and then navigates the browser to the last `/review/.../`
   * link found in the page, if there is one.
   *
   * @param page    URL of the page being extracted (not used by the extraction itself)
   * @param browser browser whose current HTML is read and which is navigated onwards
   * @return the words found on the page
   */
  private def extract(page: String, browser: Browser): List[String] = {
    val html = browser.getText()
    val words = getWordsFromHtml(html)
    println(words)
    // Write before navigating so a page without a follow-up link still gets exported.
    exportWords(words)
    getNextPageLink(html).foreach(link => browser.setUrl(link))
    words
  }

  /** Runs `op` on the writer `f` and closes `f` afterwards, even when `op` throws. */
  private def printToFile(f: java.io.Writer)(op: java.io.Writer => Unit): Unit = {
    try op(f) finally f.close()
  }

  /**
   * Returns the text of every `td` whose `class` attribute contains
   * "fs14 c33 bname fa" inside the first `<TBODY>...</TBODY>` section of `html`.
   * Returns `Nil` when the page has no such section.
   */
  private def getWordsFromHtml(html: String): List[String] = {
    val closingTag = "</TBODY>"
    val startPos = html.indexOf("<TBODY>")
    // Look for the closing tag after the opening one so the slice is never inverted.
    val endPos = html.indexOf(closingTag, math.max(startPos, 0))
    if (startPos < 0 || endPos < 0) Nil
    else {
      val table = html.substring(startPos, endPos + closingTag.length)
      val xml = XML.loadString(purify(table))
      // `contains` also accepts a class attribute that starts with the marker.
      val wordCells = (xml \\ "td").filter(n => (n \ "@class").text.contains("fs14 c33 bname fa"))
      wordCells.map(_.text).toList
    }
  }

  /**
   * Finds the last `/review/<id>/` path in `html` and returns it as an absolute
   * online.remword.cn URL, or `None` when the page contains no such path.
   */
  private def getNextPageLink(html: String): Option[String] = {
    val npRegx = """\/review\/([a-z0-9]+)\/""".r
    val links = npRegx.findAllIn(html).toList
    links.lastOption.map(path => "http://online.remword.cn" + path)
  }

  /** Runs the HTML fragment through JTidy and returns the tidied body as XML text. */
  private def purify(html: String): String = {
    val tidy = new Tidy
    tidy.setInputEncoding("UTF-8")
    tidy.setOutputEncoding("UTF-8")
    tidy.setWraplen(Integer.MAX_VALUE)
    tidy.setPrintBodyOnly(true)
    tidy.setXmlOut(true)
    tidy.setSmartIndent(true)
    val inputStream = new ByteArrayInputStream(html.getBytes("UTF-8"))
    val outputStream = new ByteArrayOutputStream()
    tidy.parseDOM(inputStream, outputStream)
    outputStream.toString("UTF-8")
  }

  /**
   * Appends one `<item><word>...</word></item>` line per word to `wordbook.xml`,
   * XML-escaping each word and writing the file as UTF-8. An empty list writes nothing.
   *
   * Named `exportWords` rather than `export` because `export` is a reserved word in
   * Scala 3.
   */
  private def exportWords(words: List[String]): Unit = {
    if (words.nonEmpty) {
      val wordbookxml = words.map(w => "<item><word>" + Utility.escape(w) + "</word></item>")
      val out = new OutputStreamWriter(new FileOutputStream("wordbook.xml", true), "UTF-8")
      printToFile(out) { p =>
        p.write(wordbookxml.mkString("\n"))
        p.write("\n")
      }
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment