Skip to content

Instantly share code, notes, and snippets.

@twairball
Created September 23, 2017 01:18
Show Gist options
  • Save twairball/2bd45f0f1dbfe3543a34193edfde2f1b to your computer and use it in GitHub Desktop.
Save twairball/2bd45f0f1dbfe3543a34193edfde2f1b to your computer and use it in GitHub Desktop.
WMT17 Zh-En corpus have different number of lines?
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 51,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import jieba\n",
"import nltk\n",
"import os\n",
"\n",
"\"\"\"\n",
"Notebook exploring the weird misalignment between Zh and En corpus. \n",
"Inspecting the files on unix console we expect 227,330 lines in both corpus. \n",
"However, looping through the file line-by-line in python we find:\n",
" EN: 227568\n",
" ZH: 227603 (diff: 35)\n",
"\n",
"\n",
"On console: \n",
"$ wc -l training/news-commentary-v12.zh-en.zh\n",
"227330 training/news-commentary-v12.zh-en.zh\n",
"\n",
"$ wc -l training/news-commentary-v12.zh-en.en\n",
"227330 training/news-commentary-v12.zh-en.en\n",
"\n",
"\n",
"WMT17 training Dataset corpus can be downloaded from:\n",
"http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz\n",
"\n",
"\n",
"\n",
"\"\"\"\n",
"zh_filepath=\"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.zh\"\n",
"en_filepath=\"tmp/wmt17_en_zh/training/news-commentary-v12.zh-en.en\""
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(227603, 227573)"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" Counting using splitlines. somehow we get a different count for En. \"\"\"\n",
"def count_splitlines(filename):\n",
" return len(open(filename).read().splitlines())\n",
"\n",
"count_splitlines(zh_filepath), count_splitlines(en_filepath)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(227603, 227568)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" readlines() matches what we typically expect when reading line-by-line from python. \"\"\"\n",
"def count_readlines(filename):\n",
" return len(open(filename).readlines())\n",
"\n",
"count_readlines(zh_filepath), count_readlines(en_filepath)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"\"\"\" Count lines that are blank. \"\"\"\n",
"def blank_line_count(filename):\n",
" with open(filename) as fd:\n",
" count = sum(1 for line in fd if len(line.strip()) == 0)\n",
" return count"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(39, 146)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"blank_line_count(zh_filepath), blank_line_count(en_filepath)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(0, 0)"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"\"\" find occurences of weird line breaks? \\n, \\r, \\t, \\v\"\"\"\n",
"import re\n",
"def cr_count(filename, substr=\"\\r\"):\n",
" full = open(filename).read()\n",
" return len(re.findall(substr, full))\n",
"\n",
"cr_count(zh_filepath, \"\\r\"), cr_count(en_filepath, \"\\r\") "
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def report_blank_lines(filename):\n",
" tot = 0\n",
" with open(filename) as f:\n",
" for i, l in enumerate(f):\n",
" if len(l.strip()) < 1:\n",
" print(\"[%d] %s[END]\" % (i, l))\n",
" tot = tot + 1\n",
" print(\" total: %d\" % tot)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[27660] \n",
"[END]\n",
"[51225] \n",
"[END]\n",
"[66871] \n",
"[END]\n",
"[75770] \n",
"[END]\n",
"[75775] \n",
"[END]\n",
"[82330] \n",
"[END]\n",
"[89880]     \n",
"[END]\n",
"[91075] \n",
"[END]\n",
"[105145] \n",
"[END]\n",
"[119307] \n",
"[END]\n",
"[126515] \n",
"[END]\n",
"[128127] \n",
"[END]\n",
"[137127] \n",
"[END]\n",
"[137604] \n",
"[END]\n",
"[145516] \n",
"[END]\n",
"[146597] \n",
"[END]\n",
"[147274] \n",
"[END]\n",
"[151833] \n",
"[END]\n",
"[166718] \n",
"[END]\n",
"[167566] \n",
"[END]\n",
"[167574] \n",
"[END]\n",
"[167586] \n",
"[END]\n",
"[167591] \n",
"[END]\n",
"[167598] \n",
"[END]\n",
"[172120] \n",
"[END]\n",
"[176885] \n",
"[END]\n",
"[178064] \n",
"[END]\n",
"[178066] \n",
"[END]\n",
"[178643] \n",
"[END]\n",
"[178983] \n",
"[END]\n",
"[178985] \n",
"[END]\n",
"[179010] \n",
"[END]\n",
"[179817] \n",
"[END]\n",
"[180836] \n",
"[END]\n",
"[183407] \n",
"[END]\n",
"[190958] \n",
"[END]\n",
"[193349] \n",
"[END]\n",
"[197577] \n",
"[END]\n",
"[206624] \n",
"[END]\n",
" total: 39\n"
]
}
],
"source": [
"report_blank_lines(zh_filepath)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[4088] \n",
"[END]\n",
"[8112] \n",
"[END]\n",
"[13275] \n",
"[END]\n",
"[13276] \n",
"[END]\n",
"[13357] \n",
"[END]\n",
"[13358] \n",
"[END]\n",
"[13581] \n",
"[END]\n",
"[13582] \n",
"[END]\n",
"[13783] \n",
"[END]\n",
"[13784] \n",
"[END]\n",
"[14646] \n",
"[END]\n",
"[14740] \n",
"[END]\n",
"[15454] \n",
"[END]\n",
"[15514] \n",
"[END]\n",
"[15515] \n",
"[END]\n",
"[16244] \n",
"[END]\n",
"[20289] \n",
"[END]\n",
"[23012] \n",
"[END]\n",
"[24964] \n",
"[END]\n",
"[24965] \n",
"[END]\n",
"[27670] \n",
"[END]\n",
"[31465] \n",
"[END]\n",
"[31466] \n",
"[END]\n",
"[32799] \n",
"[END]\n",
"[35079] \n",
"[END]\n",
"[35080] \n",
"[END]\n",
"[37662] \n",
"[END]\n",
"[37663] \n",
"[END]\n",
"[39318] \n",
"[END]\n",
"[39321] \n",
"[END]\n",
"[45101] \n",
"[END]\n",
"[48448] \n",
"[END]\n",
"[48450] \n",
"[END]\n",
"[48451] \n",
"[END]\n",
"[48454] \n",
"[END]\n",
"[49221] \n",
"[END]\n",
"[49222] \n",
"[END]\n",
"[51263] \n",
"[END]\n",
"[55062] \n",
"[END]\n",
"[64458] \n",
"[END]\n",
"[66912] \n",
"[END]\n",
"[70704] \n",
"[END]\n",
"[74897] \n",
"[END]\n",
"[75821] \n",
"[END]\n",
"[75826] \n",
"[END]\n",
"[75975] \n",
"[END]\n",
"[79343] \n",
"[END]\n",
"[80313] \n",
"[END]\n",
"[80691] \n",
"[END]\n",
"[82383] \n",
"[END]\n",
"[82385] \n",
"[END]\n",
"[83542] \n",
"[END]\n",
"[85636] \n",
"[END]\n",
"[88839] \n",
"[END]\n",
"[91118] \n",
"[END]\n",
"[91831] \n",
"[END]\n",
"[91832] \n",
"[END]\n",
"[92274] \n",
"[END]\n",
"[98705] \n",
"[END]\n",
"[100113] \n",
"[END]\n",
"[102805] \n",
"[END]\n",
"[103524] \n",
"[END]\n",
"[103525] \n",
"[END]\n",
"[103531] \n",
"[END]\n",
"[103532] \n",
"[END]\n",
"[104059] \n",
"[END]\n",
"[105195] \n",
"[END]\n",
"[105196] \n",
"[END]\n",
"[105204] \n",
"[END]\n",
"[109823] \n",
"[END]\n",
"[112173] \n",
"[END]\n",
"[112174] \n",
"[END]\n",
"[112522] \n",
"[END]\n",
"[114362] \n",
"[END]\n",
"[116957] \n",
"[END]\n",
"[116958] \n",
"[END]\n",
"[119007] \n",
"[END]\n",
"[120015] \n",
"[END]\n",
"[121140] \n",
"[END]\n",
"[121142] \n",
"[END]\n",
"[121147] \n",
"[END]\n",
"[123323] \n",
"[END]\n",
"[123324] \n",
"[END]\n",
"[126575] \n",
"[END]\n",
"[127835] \n",
"[END]\n",
"[127836] \n",
"[END]\n",
"[128796] \n",
"[END]\n",
"[128803] \n",
"[END]\n",
"[133237] \n",
"[END]\n",
"[141777] \n",
"[END]\n",
"[142861] \n",
"[END]\n",
"[142895] \n",
"[END]\n",
"[144866] \n",
"[END]\n",
"[145908] \n",
"[END]\n",
"[146305] \n",
"[END]\n",
"[146306] \n",
"[END]\n",
"[146751] \n",
"[END]\n",
"[147268] \n",
"[END]\n",
"[147269] \n",
"[END]\n",
"[147881] \n",
"[END]\n",
"[151364] \n",
"[END]\n",
"[151905] \n",
"[END]\n",
"[156970] \n",
"[END]\n",
"[162701] \n",
"[END]\n",
"[164167] \n",
"[END]\n",
"[166196] \n",
"[END]\n",
"[166202] \n",
"[END]\n",
"[166791] \n",
"[END]\n",
"[167671] \n",
"[END]\n",
"[169043] \n",
"[END]\n",
"[169044] \n",
"[END]\n",
"[172187] \n",
"[END]\n",
"[172202] \n",
"[END]\n",
"[174471] \n",
"[END]\n",
"[174472] \n",
"[END]\n",
"[177674] \n",
"[END]\n",
"[178739] \n",
"[END]\n",
"[179035] \n",
"[END]\n",
"[179036] \n",
"[END]\n",
"[179379] \n",
"[END]\n",
"[179808] \n",
"[END]\n",
"[180869] \n",
"[END]\n",
"[182652] \n",
"[END]\n",
"[184332] \n",
"[END]\n",
"[184333] \n",
"[END]\n",
"[184483] \n",
"[END]\n",
"[184484] \n",
"[END]\n",
"[190377] \n",
"[END]\n",
"[191032] \n",
"[END]\n",
"[191407] \n",
"[END]\n",
"[192485] \n",
"[END]\n",
"[192486] \n",
"[END]\n",
"[195724] \n",
"[END]\n",
"[197527] \n",
"[END]\n",
"[199217] \n",
"[END]\n",
"[199218] \n",
"[END]\n",
"[199819] \n",
"[END]\n",
"[202672] \n",
"[END]\n",
"[211684] \n",
"[END]\n",
"[214254] \n",
"[END]\n",
"[216153] \n",
"[END]\n",
"[216416] \n",
"[END]\n",
"[216638] \n",
"[END]\n",
"[217317] \n",
"[END]\n",
"[221007] \n",
"[END]\n",
"[225697] \n",
"[END]\n",
" total: 146\n"
]
}
],
"source": [
"report_blank_lines(en_filepath)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment