Skip to content

Instantly share code, notes, and snippets.

@sergey-tihon
Created September 30, 2019 07:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergey-tihon/6d0aba1c8653cdd0e6f95b67121d859b to your computer and use it in GitHub Desktop.
Save sergey-tihon/6d0aba1c8653cdd0e6f95b67121d859b to your computer and use it in GitHub Desktop.
Fast file parser
the -0.071549 0.093459 0.023738 -0.090339 0.056123 0.32547 -0.39796 -0.092139 0.061181 -0.1895 0.13061 0.14349 0.011479 0.38158 0.5403 -0.14088 0.24315 0.23036 -0.55339 0.048154 0.45662 3.2338 0.020199 0.049019 -0.014132 0.076017 -0.11527 0.2006 -0.077657 0.24328 0.16368 -0.34118 -0.06607 0.10152 0.038232 -0.17668 -0.88153 -0.33895 -0.035481 -0.55095 -0.016899 -0.43982 0.039004 0.40447 -0.2588 0.64594 0.26641 0.28009 -0.024625 0.63302 -0.317 0.10271 0.30886 0.097792 -0.38227 0.086552 0.047075 0.23511 -0.32127 -0.28538 0.1667 -0.0049707 -0.62714 -0.24904 0.29713 0.14379 -0.12325 -0.058178 -0.001029 -0.082126 0.36935 -0.00058442 0.34286 0.28426 -0.068599 0.65747 -0.029087 0.16184 0.073672 -0.30343 0.095733 -0.5286 -0.22898 0.064079 0.015218 0.34921 -0.4396 -0.43983 0.77515 -0.87767 -0.087504 0.39598 0.62362 -0.26211 -0.30539 -0.022964 0.30567 0.06766 0.15383 -0.11211 -0.09154 0.082562 0.16897 -0.032952 -0.28775 -0.2232 -0.090426 1.2407 -0.18244 -0.0075219 -0.041388 -0.011083 0.078186 0.38511 0.23334 0.14414 -0.0009107 -0.26388 -0.20481 0.10099 0.14076 0.28834 -0.045429 0.37247 0.13645 -0.67457 0.22786 0.12599 0.029091 0.030428 -0.13028 0.19408 0.49014 -0.39121 -0.075952 0.074731 0.18902 -0.16922 -0.26019 -0.039771 -0.24153 0.10875 0.30434 0.036009 1.4264 0.12759 -0.073811 -0.20418 0.0080016 0.15381 0.20223 0.28274 0.096206 -0.33634 0.50983 0.32625 -0.26535 0.374 -0.30388 -0.40033 -0.04291 -0.067897 -0.29332 0.10978 -0.045365 0.23222 -0.31134 -0.28983 -0.66687 0.53097 0.19461 0.3667 0.26185 -0.65187 0.10266 0.11363 -0.12953 -0.68246 -0.18751 0.1476 1.0765 -0.22908 -0.0093435 -0.20651 -0.35225 -0.2672 -0.0034307 0.25906 0.21759 0.66158 0.1218 0.19957 -0.20303 0.34474 -0.24328 0.13139 -0.0088767 0.33617 0.030591 0.25577
, 0.17651 0.29208 -0.0020768 -0.37523 0.0049139 0.23979 -0.28893 -0.014643 -0.10993 0.15592 0.20627 0.47675 0.099907 -0.14058 0.21114 0.12126 -0.31831 -0.089433 -0.090553 -0.31962 0.21319 2.4844 -0.077521 -0.084279 0.20186 0.26084 -0.40411 -0.19127 0.24715 0.22394 -0.063437 0.20379 -0.18463 -0.088413 0.024169 -0.28769 -0.61246 -0.12683 -0.088273 0.18331 -0.53161 -0.1997 -0.26703 0.15312 -0.015239 -0.082844 0.47856 -0.29612 0.11168 -0.02579 -0.011697 0.19923 -0.14267 0.6625 -0.051739 -0.16938 -0.15635 0.092806 0.32548 0.11724 0.28788 -0.060651 -0.14153 0.16668 0.26861 -0.031001 -0.39665 0.35304 0.2385 0.12388 0.45698 -0.12559 -0.12804 0.37449 0.2446 0.23073 0.20808 0.051258 -0.21816 -0.036409 -0.0388 -0.042487 -0.30779 -0.025449 0.22532 0.045538 -0.48934 -0.13988 0.17394 -0.46137 -0.26555 0.15473 0.063816 -0.17022 -0.15762 0.075765 0.12151 -0.4934 -0.10909 0.034487 0.29947 0.01869 -0.16534 0.016679 0.16341 -0.27418 0.077797 1.4023 0.025275 0.094725 -0.040735 -0.10642 0.023364 0.079143 -0.16615 -0.23013 -0.14071 0.40159 -0.34951 0.018545 0.22434 0.76922 0.24722 0.14936 0.42368 -0.72059 -0.038541 0.15522 0.33596 -0.43077 -0.026925 -0.37733 0.24271 -0.46495 0.45783 0.23693 0.079361 -0.32244 -0.42434 -0.11138 0.55426 0.085153 -0.020581 -0.046386 1.2467 0.13177 0.067092 -0.5778 0.013586 -0.071274 0.017311 0.089781 0.19857 -0.032205 0.64843 -0.23797 -0.19676 0.20203 0.21074 -0.50347 0.026823 -0.045444 -0.22642 -0.19977 -0.12138 0.16941 0.061998 0.42631 -0.088383 0.45756 0.077774 0.061342 0.4571 -0.17787 -0.14597 0.32654 0.002443 -0.11886 0.10081 -0.020011 1.0366 -0.39814 -0.6818 0.23685 -0.20396 -0.17668 -0.31385 0.14834 -0.052187 0.0613 -0.32582 0.19153 -0.15469 -0.14679 0.046971 0.032325 -0.22006 -0.20774 -0.23189 -0.10814
. 0.12289 0.58037 -0.069635 -0.50288 0.10503 0.39945 -0.38635 -0.084279 0.12219 0.080312 0.32337 0.47579 -0.038375 -0.00709 0.41524 0.32121 -0.21185 0.36144 -0.055623 -0.030512 0.42854 2.8547 -0.14623 -0.17557 0.31197 -0.13118 0.033298 0.13093 0.089889 -0.12417 0.0023396 -0.068954 -0.10754 -0.11551 -0.31052 -0.12097 -0.46691 -0.0836 -0.037664 -0.071779 -0.11899 -0.20381 -0.12424 0.46339 -0.19828 -0.0080365 0.53718 0.031739 0.34331 0.0079704 0.0048744 0.030592 -0.17615 0.82342 -0.13793 -0.10075 -0.12686 0.074735 -0.088719 -0.042719 0.076624 0.089263 0.064445 -0.031958 0.15254 -0.10384 0.076604 0.34099 0.24331 -0.10452 0.40714 -0.1826 -0.040667 0.50878 0.08076 0.22759 -0.042162 -0.18171 -0.095025 0.030334 0.088202 -3.9843e-06 -0.0039877 0.15724 0.33167 0.08471 -0.25919 -0.41384 0.2992 -0.54255 0.032129 0.1003 0.44202 0.044682 -0.090681 -0.10481 -0.1186 -0.31972 -0.2079 -0.040203 -0.022988 0.22824 0.0055238 0.12568 -0.1464 -0.14904 -0.11561 1.0517 -0.19498 0.083958 0.044812 -0.12965 -0.093468 0.21237 -0.088332 -0.1868 0.26521 0.13097 -0.048102 -0.22467 0.28412 0.34907 0.34833 0.017877 0.30504 -0.83453 0.048856 -0.1933 0.20764 -0.49701 -0.18747 -0.076801 0.15558 -0.46844 0.40944 0.21386 0.082392 -0.26491 -0.21224 -0.13293 0.14738 -0.14192 0.18994 -0.15587 1.0738 0.40789 -0.27452 -0.18431 0.00068679 -0.087115 0.19672 0.40918 -0.35462 -0.06326 0.4492 -0.060568 -0.041636 0.20531 0.017025 -0.58448 0.075441 0.082116 -0.46008 0.012393 -0.02531 0.14177 -0.092192 0.34505 -0.52136 0.57304 0.011973 0.033196 0.29672 -0.27899 0.19979 0.25666 0.082079 -0.078436 0.093719 0.24202 1.3495 -0.30434 -0.30936 0.42047 -0.079068 -0.14819 -0.089404 0.0668 0.22405 0.27226 -0.035236 0.17688 -0.0536 0.0070031 -0.033006 -0.080021 -0.24451 -0.039174 -0.16236 -0.096652
of 0.052924 0.25427 0.31353 -0.35613 0.029629 0.51034 -0.10716 0.15195 0.057698 0.06149 0.06116 0.39911 -0.00029018 0.31978 0.43257 -0.14708 0.054842 0.27079 -0.14051 -0.30101 0.16313 3.0013 0.22231 -0.14279 0.083705 0.089866 -0.52706 -0.089661 0.27311 0.31413 -0.04081 0.060557 -0.042656 0.24178 -0.29187 0.22575 -0.6298 -0.14641 -0.22429 -0.056621 -0.17776 -0.64269 0.51626 0.22305 0.12124 0.48074 0.41743 0.54805 0.40955 0.42407 0.049906 -0.32574 0.46298 0.19245 0.28143 0.2966 0.063593 -0.11906 -0.15016 -0.04984 0.40675 0.010675 -0.69127 0.048729 0.26391 0.30961 -0.11921 0.25548 -0.28219 -0.037413 0.36461 0.027129 0.20786 0.53325 0.50148 0.72381 0.065292 -0.078716 -0.10537 -0.08081 -0.2096 0.040902 -0.88101 0.24715 0.16146 0.10361 0.19705 -0.27365 0.89902 -0.29981 0.036165 0.041238 0.60105 -0.18911 -0.43887 -0.14097 0.44073 -0.19999 0.28834 -0.25458 -0.10985 -0.0027379 0.091735 0.17021 -0.16305 -0.57439 0.37063 1.7262 -0.24656 0.51681 -0.15355 -0.15553 0.019783 0.1803 0.38178 0.094443 -0.55158 -0.20242 -0.4386 -0.42108 0.27525 0.58977 0.026655 0.16401 0.13893 -0.68692 0.51071 0.29278 0.022041 -0.18156 -0.64905 0.16923 -0.01059 0.21785 -0.27242 0.27967 0.1395 -0.70559 -0.26034 -0.44017 0.15303 0.19693 -0.096838 0.14827 1.1294 -0.31267 0.0099916 -0.48623 0.080584 0.35608 -0.19925 0.19306 -0.2004 -0.44194 0.75766 0.24487 -0.18903 0.26653 -0.21339 -0.54083 0.40532 -0.02796 -0.13398 -0.11086 0.059506 0.24052 -0.59739 -0.0024069 -0.18593 1.042 -0.12969 0.20813 0.33305 -0.1278 0.085662 -0.076422 0.31407 -0.23784 -0.054838 0.011369 0.845 -0.34165 0.093983 0.082445 -0.27777 -0.44226 -0.063078 0.37274 0.054468 0.24197 -0.040886 0.3894 -0.10509 0.23372 0.096027 -0.30324 0.24488 -0.086254 -0.41917 0.46496
/// <summary>
/// Loads space-separated word vectors from a text file. Every line is expected to be
/// <c>word v1 v2 ... vN</c>; the values go into row <c>lineIndex</c> of a dense
/// <c>float[,]</c> matrix and the word is recorded in a word-to-row dictionary.
/// </summary>
public class DataLoader
{
    /// <summary>
    /// Reads <paramref name="filePath"/> sequentially, parses the lines in parallel and
    /// returns only after both the matrix and the word index are fully populated.
    /// </summary>
    /// <param name="filePath">Path of the UTF-8 text file to read.</param>
    /// <param name="wordVectorLength">Number of numeric values parsed from every line.</param>
    /// <param name="capacity">
    /// Number of matrix rows to allocate (also the initial dictionary capacity).
    /// The file must not contain more lines than this.
    /// </param>
    /// <exception cref="ArgumentNullException"><paramref name="filePath"/> is null.</exception>
    /// <exception cref="FormatException">
    /// A line has no space after the word, has fewer than <paramref name="wordVectorLength"/>
    /// values, or contains a token that is not a valid number.
    /// </exception>
    /// <exception cref="IndexOutOfRangeException">
    /// The file has more lines than <paramref name="capacity"/>.
    /// </exception>
    public void Load(string filePath, int wordVectorLength, int capacity = 1_200_000)
    {
        if (filePath == null)
        {
            throw new ArgumentNullException(nameof(filePath));
        }

        _wordVectorLength = wordVectorLength;
        _wordVectors = new float[capacity, wordVectorLength];
        _word2Index = new Dictionary<string, int>(capacity, StringComparer.InvariantCultureIgnoreCase);

        // The file is machine-readable data, so numbers are parsed with the invariant culture
        // ('.' as decimal separator) regardless of the culture of the calling thread.
        // The styles are the ones float.Parse uses by default.
        const System.Globalization.NumberStyles numberStyles =
            System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        // Default options => one message at a time, so the (non thread-safe) dictionary
        // is only ever touched by a single thread.
        var updateDict = new ActionBlock<(string word, int pos)>(x =>
        {
            // Parse workers finish out of order. Keeping the lowest row makes the result
            // deterministic: the first occurrence in the file wins for words that are
            // equal under the case-insensitive comparer.
            if (!_word2Index.TryGetValue(x.word, out var existing) || x.pos < existing)
            {
                _word2Index[x.word] = x.pos;
            }
        });

        var actionBlock = new ActionBlock<(string line, int pos)>(x =>
        {
            var ind = x.line.IndexOf(' ');
            if (ind < 0)
            {
                throw new FormatException(
                    $"Line {x.pos + 1}: expected a word followed by {wordVectorLength} values.");
            }

            var word = x.line.Substring(0, ind);
            var span = x.line.AsSpan();
            for (var i = 0; i < wordVectorLength; i++)
            {
                // Skip the separator(s) in front of the next value.
                while (ind < span.Length && span[ind] == ' ')
                {
                    ind++;
                }

                if (ind == span.Length)
                {
                    throw new FormatException(
                        $"Line {x.pos + 1}: expected {wordVectorLength} values but found {i}.");
                }

                // Find the end of the current token.
                var pos = ind;
                while (pos < span.Length && span[pos] != ' ')
                {
                    pos++;
                }

                // Each worker writes only to its own row, so no synchronization is needed here.
                _wordVectors[x.pos, i] = float.Parse(span.Slice(ind, pos - ind), numberStyles, invariant);
                ind = pos;
            }

            updateDict.Post((word, x.pos));
        }, new ExecutionDataflowBlockOptions
        {
            MaxDegreeOfParallelism = Environment.ProcessorCount
        });

        using var fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read);
        using var bufferedStream = new BufferedStream(fileStream);
        using var streamReader = new StreamReader(bufferedStream, Encoding.UTF8);

        string line;
        var position = 0;
        while ((line = streamReader.ReadLine()) != null)
        {
            // Post returns false once the block has faulted; there is no point in reading
            // the rest of the file then. The failure is rethrown by the wait below.
            if (!actionBlock.Post((line, position++)))
            {
                break;
            }
        }

        actionBlock.Complete();
        try
        {
            actionBlock.Completion.GetAwaiter().GetResult();
        }
        finally
        {
            // All words have been posted by now (or parsing failed). Drain the dictionary
            // block as well, otherwise Load could return while the index is still being filled.
            updateDict.Complete();
            updateDict.Completion.GetAwaiter().GetResult();
        }
    }

    private int _wordVectorLength;
    private float[,] _wordVectors;
    private Dictionary<string, int> _word2Index;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment