Skip to content

Instantly share code, notes, and snippets.

@arjunguha
Created December 30, 2023 22:12
Show Gist options
  • Save arjunguha/95839fe6523ef2beb29637a33f49a678 to your computer and use it in GitHub Desktop.
Save arjunguha/95839fe6523ef2beb29637a33f49a678 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/root/venv/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from datasets import load_dataset\n",
"import pygments\n",
"import pandas as pd"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The list comes from the web page for the dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ALL_LANGS = [\n",
" 'ada', 'agda', 'alloy', 'antlr', 'applescript', 'assembly', 'augeas', 'awk', 'batchfile', 'bison', 'bluespec', 'c',\n",
"'c++', 'c-sharp', 'clojure', 'cmake', 'coffeescript', 'common-lisp', 'css', 'cuda', 'dart', 'dockerfile', 'elixir',\n",
"'elm', 'emacs-lisp','erlang', 'f-sharp', 'fortran', 'glsl', 'go', 'groovy', 'haskell','html', 'idris', 'isabelle', 'java', \n",
"'java-server-pages', 'javascript', 'julia', 'kotlin', 'lean', 'literate-agda', 'literate-coffeescript', 'literate-haskell',\n",
" 'lua', 'makefile', 'maple', 'markdown', 'mathematica', 'matlab', 'ocaml', 'pascal', 'perl', 'php', 'powershell', 'prolog',\n",
" 'protocol-buffer', 'python', 'r', 'racket', 'restructuredtext', 'rmarkdown', 'ruby', 'rust', 'sas', 'scala', 'scheme', \n",
" 'shell', 'smalltalk', 'solidity', 'sparql', 'sql', 'stan', 'standard-ml', 'stata', 'systemverilog', 'tcl', 'tcsh', 'tex', \n",
" 'thrift', 'typescript', 'verilog', 'vhdl', 'visual-basic', 'xslt', 'yacc', 'zig'\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/ada/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/agda/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/alloy/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/antlr/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/applescript/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/assembly/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/augeas/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/awk/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/batchfile/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/bison/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/bluespec/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/c/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/c++/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/c-sharp/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/clojure/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/cmake/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/coffeescript/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/common-lisp/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/css/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/cuda/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/dart/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/dockerfile/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/elixir/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/elm/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/emacs-lisp/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/erlang/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/f-sharp/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/fortran/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/glsl/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/go/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/groovy/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/haskell/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/html/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/idris/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/isabelle/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/java/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/java-server-pages/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/javascript/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/julia/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/kotlin/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/lean/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/literate-agda/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/literate-coffeescript/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/literate-haskell/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/lua/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/makefile/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/maple/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/markdown/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/mathematica/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/matlab/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/ocaml/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/pascal/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/perl/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/php/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/powershell/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/prolog/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/protocol-buffer/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/python/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/r/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/racket/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/restructuredtext/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/rmarkdown/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/ruby/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/rust/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/sas/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/scala/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/scheme/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/shell/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/smalltalk/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/solidity/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/sparql/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/sql/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/stan/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/standard-ml/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/stata/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/systemverilog/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/tcl/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/tcsh/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/tex/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/thrift/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/typescript/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/verilog/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/vhdl/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/visual-basic/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/xslt/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/yacc/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n",
"Found cached dataset the-stack-smol-xs (/root/.cache/huggingface/datasets/bigcode___the-stack-smol-xs/zig/1.0.0/ac1274377605fe676fc7ec99e82e11f22ee29fcc9d3297d78193aca3ac4ad16f)\n"
]
}
],
"source": [
"ds = {lang: load_dataset(\"bigcode/the-stack-smol-xs\", lang, split=\"train\") for lang in ALL_LANGS}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"lmap = {'c-sharp':'csharp', 'f-sharp':'fsharp', 'standard-ml':'sml', 'batchfile':'batch','java-server-pages':'jsp'}\n",
"\n",
"def pygments_language_id_to_thestack_language_id(str):\n",
" if str in lmap:\n",
" return lmap[str]\n",
" return str\n",
"\n",
"def can_lex_without_errors(lexer, contents: str):\n",
" tokens = pygments.lex(contents, lexer)\n",
" for (tok_type, tok_text) in tokens:\n",
" if tok_type == pygments.token.Token.Error:\n",
" return False\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['content', 'lang', 'size', 'ext', 'max_stars_count', 'avg_line_length', 'max_line_length', 'alphanum_fraction'])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ds[\"ada\"][0].keys()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"no lexer for assembly\n",
"no lexer for bison\n",
"no lexer for bluespec\n",
"no lexer for literate-coffeescript\n",
"no lexer for maple\n",
"no lexer for protocol-buffer\n",
"no lexer for rmarkdown\n",
"no lexer for visual-basic\n",
"no lexer for yacc\n"
]
}
],
"source": [
"from pathlib import Path\n",
"\n",
"for language in ds:\n",
" try:\n",
" lexer = pygments.lexers.get_lexer_by_name(pygments_language_id_to_thestack_language_id(language))\n",
" except pygments.util.ClassNotFound:\n",
" print(f\"no lexer for {language}\")\n",
" continue\n",
" for (index, sample) in enumerate(ds[language]):\n",
" if not can_lex_without_errors(lexer, sample[\"content\"]):\n",
" dir_path = Path(\"unlexable_files\") / language\n",
" if not dir_path.exists():\n",
" dir_path.mkdir()\n",
" (dir_path / f\"{index}.{sample['ext']}\").write_text(sample[\"content\"])"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"language\n",
"ada 8\n",
"agda 8\n",
"alloy 6\n",
"antlr 37\n",
"applescript 10\n",
"awk 24\n",
"c 11\n",
"cmake 5\n",
"coffeescript 2\n",
"common-lisp 2\n",
"css 30\n",
"dart 3\n",
"dockerfile 2\n",
"elixir 24\n",
"elm 8\n",
"emacs-lisp 6\n",
"erlang 3\n",
"fortran 26\n",
"glsl 33\n",
"groovy 1\n",
"haskell 4\n",
"html 9\n",
"idris 14\n",
"javascript 17\n",
"julia 3\n",
"kotlin 13\n",
"literate-haskell 9\n",
"lua 3\n",
"makefile 8\n",
"markdown 5\n",
"mathematica 33\n",
"ocaml 1\n",
"pascal 5\n",
"perl 9\n",
"powershell 1\n",
"prolog 9\n",
"r 3\n",
"racket 10\n",
"sas 19\n",
"scala 3\n",
"scheme 7\n",
"smalltalk 42\n",
"solidity 29\n",
"sparql 13\n",
"sql 17\n",
"stata 1\n",
"tcl 48\n",
"thrift 4\n",
"typescript 19\n",
"vhdl 12\n",
"zig 12\n",
"dtype: int64"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"errors_df = pd.DataFrame(errors).drop(columns=[\"index\"])\n",
"errors_df.groupby(\"language\").size()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "4b195c9ba378eb519b14e7f259b82f2dffeee53eaf931c5b8aa204642c58cd1a"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment