Created
July 31, 2012 13:43
-
-
Save Carolusian/3217139 to your computer and use it in GitHub Desktop.
Export wordbook from remword to youdao dict
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.eclipse.swt.SWT | |
import org.eclipse.swt.widgets.{Shell, Display, ToolItem, ToolBar, Listener, Event} | |
import org.eclipse.swt.browser.Browser | |
import org.eclipse.swt.layout._ | |
import scala.util.matching.Regex | |
import scala.xml._ | |
import org.w3c.tidy.Tidy | |
import java.io._ | |
object Remword2YouDao { | |
// Caption of the toolbar button whose listener extracts the words on the current page.
val btnText = "Click Here to Export to Youdao Format.."
// Caption of the toolbar button whose listener passes `list` to `export`.
val btnExport = "Export"
// Matches the URL of a review page; the capture group is the path segment after /review/.
// The dots of the host name are escaped so they match a literal '.' only.
val wordbookUrl = """^http://online\.remword\.cn/review/([0-9a-zA-Z]+)/$""".r
// Buffer handed to `export` by the "Export" button. Nothing in this object appends to it.
val list = new scala.collection.mutable.ListBuffer[String]()
/** Builds the SWT window (a toolbar with two buttons above an embedded browser), wires the
  * button listeners and runs the event loop until the shell is disposed.
  *
  * @param args command-line arguments; not read by this method
  */
def main(args: Array[String]): Unit = {
  val grid = new GridLayout
  grid.numColumns = 1

  val display = new Display
  try {
    val shell = new Shell(display)
    shell.setLayout(grid)
    shell.setText("Export remword's wordbook to youdao format...")
    shell.setMaximized(true)

    val toolbar = new ToolBar(shell, SWT.NONE)
    toolbar.setBounds(5, 5, 350, 30)
    val item = new ToolItem(toolbar, SWT.PUSH)
    val exportItem = new ToolItem(toolbar, SWT.PUSH)
    item.setText(btnText)
    exportItem.setText(btnExport)

    // Let the browser fill and grab all remaining space in the single grid column.
    val browserGridData = new GridData
    browserGridData.verticalAlignment = GridData.FILL
    browserGridData.horizontalAlignment = GridData.FILL
    browserGridData.grabExcessHorizontalSpace = true
    browserGridData.grabExcessVerticalSpace = true
    val browser = new Browser(shell, SWT.NONE)
    browser.setLayoutData(browserGridData)
    shell.open()

    // First button: when the browser shows a review page, extract its words.
    val listener = new Listener() {
      def handleEvent(event: Event): Unit = {
        // A failure while scraping one page must not escape into the event loop below,
        // where it would terminate the whole application.
        try {
          val url = browser.getUrl
          url match {
            case wordbookUrl(_) => extract(url, browser)
            case _ => // not a review page: nothing to extract
          }
        } catch {
          case scala.util.control.NonFatal(e) =>
            System.err.println("Could not extract words from the current page: " + e)
        }
      }
    }

    // Second button: write out whatever has been collected in `list`.
    val listener2 = new Listener() {
      def handleEvent(event: Event): Unit = {
        // Nothing in this object appends to `list`, so skip the write when it is empty
        // rather than appending an empty record to the output file.
        if (list.nonEmpty) {
          try {
            export(list.toList)
          } catch {
            case scala.util.control.NonFatal(e) =>
              System.err.println("Could not export the collected words: " + e)
          }
        }
      }
    }

    item.addListener(SWT.Selection, listener)
    exportItem.addListener(SWT.Selection, listener2)
    browser.setUrl("http://www.remword.cn/index.php")

    while (!shell.isDisposed) {
      if (!display.readAndDispatch())
        display.sleep()
    }
  } finally {
    // Release the display even when window construction or the event loop fails.
    display.dispose()
  }
}
/** Extracts the words from the page currently loaded in `browser`, prints them, appends them
  * to the output file via `export`, and then points the browser at the next page link.
  *
  * @param page    URL of the current page as passed by the caller; not read by this method
  * @param browser browser whose page source is scanned and which is then navigated
  * @return the words found on the current page
  */
private def extract(page: String, browser: Browser): List[String] = {
  // Read the page source once; it feeds both the word scan and the link scan.
  val html = browser.getText()
  val words = getWordsFromHtml(html)
  println(words)
  // Persist this page's words before navigating: looking up the next link can throw
  // (e.g. when the page has none), and the words must not be lost in that case.
  export(words)
  browser.setUrl(getNextPageLink(html))
  words
}
/** Applies `op` to the writer `f` and closes `f` afterwards, whether or not `op` throws.
  *
  * @param f  writer to hand to `op`; it is always closed before this method returns
  * @param op action that performs the actual writes
  */
private def printToFile(f: java.io.FileWriter)(op: java.io.FileWriter => Unit): Unit =
  try {
    op(f)
  } finally {
    f.close()
  }
/** Collects the text of every `td` whose `class` attribute contains "fs14 c33 bname fa",
  * looking only inside the first `<TBODY>…</TBODY>` section of the page source.
  *
  * @param html page source to scan
  * @return the matching cell texts in document order, or `Nil` when the page has no
  *         `<TBODY>…</TBODY>` section
  */
private def getWordsFromHtml(html: String): List[String] = {
  // NOTE(review): the tag search is case-sensitive and only finds upper-case tags —
  // confirm that this is what the browser's getText() returns on every platform.
  val openTag = "<TBODY>"
  val closeTag = "</TBODY>"

  val startPos = html.indexOf(openTag)
  // Search for the closing tag after the opening one so the slice can never be inverted.
  val endPos = if (startPos < 0) -1 else html.indexOf(closeTag, startPos)

  if (endPos < 0) {
    // No table section on this page: nothing to extract.
    Nil
  } else {
    val table = html.substring(startPos, endPos + closeTag.length)
    // The fragment is tidied first so that it can be parsed as XML.
    val xml = XML.loadString(purify(table))
    (xml \\ "td")
      // `contains` also accepts a class attribute that starts with the marker (index 0).
      .filter(td => (td \ "@class").text.contains("fs14 c33 bname fa"))
      .map(_.text)
      .toList
  }
}
/** Builds the absolute URL of the last `/review/<id>/` path that occurs in `html`.
  *
  * The id accepts the same characters as the capture group of `wordbookUrl`
  * (digits plus lower- and upper-case letters).
  *
  * @param html page source to scan for review links
  * @return "http://online.remword.cn" followed by the last matching path
  * @throws NoSuchElementException if `html` contains no review link
  */
private def getNextPageLink(html: String): String = {
  val reviewPath = """/review/([0-9a-zA-Z]+)/""".r
  reviewPath.findAllIn(html).toList.lastOption match {
    case Some(path) => "http://online.remword.cn" + path
    case None =>
      throw new NoSuchElementException("no /review/<id>/ link found in the page source")
  }
}
/** Feeds `html` through `org.w3c.tidy.Tidy` and returns what Tidy writes out, as a string.
  *
  * Tidy is set up with UTF-8 for input and output, a wrap length of `Integer.MAX_VALUE`,
  * body-only printing, XML output and smart indent, as the setter calls below show.
  *
  * @param html markup to clean up
  * @return Tidy's output decoded as UTF-8
  */
private def purify(html: String) = {
  val utf8 = "UTF-8"

  val cleaner = new Tidy
  cleaner.setInputEncoding(utf8)
  cleaner.setOutputEncoding(utf8)
  cleaner.setWraplen(Integer.MAX_VALUE)
  cleaner.setPrintBodyOnly(true)
  cleaner.setXmlOut(true)
  cleaner.setSmartIndent(true)

  // Tidy works on streams, so wrap the string in in-memory byte streams.
  val source = new ByteArrayInputStream(html.getBytes(utf8))
  val sink = new ByteArrayOutputStream()
  cleaner.parseDOM(source, sink)
  sink.toString(utf8)
}
/** Appends one `<item><word>…</word></item>` line per word to "wordbook.xml" in the
  * current working directory.
  *
  * Each word is XML-escaped before it is embedded, and an empty list writes nothing.
  *
  * @param words words to append, in output order
  */
private def export(words: List[String]): Unit = {
  if (words.nonEmpty) {
    // Join the items explicitly: calling toString on the list would write "List(…, …)".
    val wordbookxml = words
      .map(w => "<item><word>" + scala.xml.Utility.escape(w) + "</word></item>")
      .mkString("\n")
    // Second constructor argument `true` opens the file in append mode.
    printToFile(new java.io.FileWriter("wordbook.xml", true)) { p =>
      p.write(wordbookxml)
      p.write("\n")
    }
  }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment