Skip to content

Instantly share code, notes, and snippets.

@fabiolimace
Created December 9, 2023 13:52
Show Gist options
  • Save fabiolimace/c1210534a4fb49186b093fa2766fc9ea to your computer and use it in GitHub Desktop.
Save fabiolimace/c1210534a4fb49186b093fa2766fc9ea to your computer and use it in GitHub Desktop.
List the mixed tokens of a file, i.e. those which are not purely alphanumeric
#!/usr/bin/gawk -f
#
# Print, one per line, every field of the input that is NOT composed
# exclusively of alphanumeric characters (e.g. words with attached
# punctuation, numbers with separators, hashtags, URLs).
#
# Fields are split using awk's current field separator; purely
# alphanumeric fields are silently dropped. Records without fields
# produce no output because the loop body never runs for them.
{
    for (pos = 1; pos <= NF; pos++) {
        # Skip tokens that consist only of letters and digits.
        if ($pos ~ /^[[:alnum:]]+$/)
            continue
        print $pos
    }
}
@fabiolimace
Copy link
Author

OUTPUT SAMPLE:

shuf -n 100 output.txt 
      1 fofos!'
      6 207,7
      1 milicianos."
      1 corpus”.
      1 KK.O
     15 'Manchester
      4 roletas/catracas
      2 drástico.
      1 cabelo!!
      1 5.265
      1 federais".
      2 imunocomprometidas.
      7 responsabilidades.
      4 Ex-vice-governador
      1 #foryou
      1 Araujo/CNA
     35 'Lei
      1 A.P.,
      2 "Exercite-se
      1 mosquito!
      1 Informação)
      1 desvendada,
      1 arrasaram!",
      8 grisalhos,
      2 (@mcsabrinaoficial)
      9 Amália,
      1 (MBA).
      1 Elói,
      5 ‘que
      3 homossexualidade,
      1 criança-
     18 'pressão
      2 #profissionaisdasaude
      1 mar”.
      1 Thooth'
      1 Vantagens.
      1 não-tripulada
     13 gostoso.
      2 "Mantenha
      1 JEB’s
      1 insultados,
      9 risada.
      1 elencopic.twitter.com/rKcK6rshtR
      2 manchetes.
      2 XS,
      1 charlatanismo;
      1 alteração",
      1 reconhecendo,
      1 encomendou,
     12 'Pediria
      3 (06/11),
      5 Bonemer,
     72 Bittencourt,
      1 fronteiriça,
      4 narração,
      6 bola!
      1 deflagrado.
      4 patas,
      2 "gentrificação".
      1 #Mesquita.
      1 NTV.
      4 'Tome
      7 Gaza;
      1 7,40,
      1 (Sinaval)
     13 Mágica'
      1 quarteirão",
      6 bombástica,
      8 Ceciliano:
      1 outorgada;
      4 5,56.
      1 forra.
     13 Metrópoles.
     36 desabrigadas,
      1 "matérias
      2 tamanhos.
     16 (PSD-MA)
      1 7/9;
      3 (Facebook
      1 trajetórias,
      2 vai'.
      1 Acidente;
      1 saliente,
      1 Digital.“A
      3 'curtir
      2 protocolado,
      1 Tainhas,
     11 lançar,
      1 pic.twitter.com/CRIU0BSbZn
      1 29.000
      4 tática'
      1 Beija-Flor),
      3 bagageiro,
      1 91,3%.
     25 Galaxy,
     10 Ex-sisters
      2 (Denatran)
      1 Neonatologia,
      2 ##boyfriends
      1 seletivo?

@fabiolimace
Copy link
Author

Usage example: find all token patterns which include round brackets.

cat *.mixed.tokens.txt | grep -E --color=auto '[()]' | awk '{print $2}' | sed 's/[[:alnum:]-]//g' | sort | uniq -c | sort | tac | head -n 500
  50775 )
  47512 (
  37302 ()
  34230 ),
  31194 ).
  25420 (),
  15718 ().
  12324 (,
   7065 /)
   5469 );
   3008 (/)
   2914 ();
   2881 .)
   2576 (,%)
   2506 ):
   2493 ():
   2190 //).
   2190 //)
   2088 (,%),
   1998 .).
   1975 /).
   1798 (@)
   1798 (.),
   1650 (/),
   1609 .),
   1579 (,%).
   1554 ("
   1504 /),
   1421 (.
   1417 (.)
   1403 (,%
   1211 ..).
   1100 ./).
   1086 ./),
   1076 (/).
   1061 (.).
   1045 ./)
    998 ")
    974 (“
    907 ..)
    879 "),
    814 ,)
    747 (%)
    727 ").
    676 )",
    673 )".
    670 (:
    656 ("")
    618 (/
    612 ,).
    604 )."
    581 ”)
    581 (+,%)
    580 ,%)
    513 )?
    512 !)
    511 (//)
    510 (%),
    469 ,%),
    463 ()?
    463 (%).
    454 ”).
    445 (%
    442 ,%).
    424 (""),
    422 //),
    406 (,%);
    400 ”),
    398 ..),
    379 ?)
    356 ,),
    348 "")
    338 (;
    332 (↑,%)
    317 )"
    304 (,)
    293 !),
    291 (./)
    291 (+,%),
    289 ()",
    288 !).
    283 ?).
    279 %)
    262 (+,%).
    255 “”)
    255 .,).
    252 .);
    250 /.)
    249 (..)
    238 ("").
    235 (.,
    231 ?),
    231 %).
    226 (//),
    222 ('
    221 //.)
    218 ()".
    216 (,),
    211 ""),
    204 (↓,%)
    203 (./),
    193 (./).
    186 ...)
    186 .,)
    186 .(
    185 )”,
    179 (“”)
    178 "").
    177 .,),
    176 (@_)
    169 (/,
    169 (..
    166 %),
    165 /):
    164 /);
    164 ./);
    154 (....,
    152 //);
    150 (%);
    148 “”).
    146 (...)
    142 (..),
    140 ....).
    136 ()."
    134 "(
    132 ...).
    130 (,).
    128 ....)
    123 ()”,
    121 (/);
    119 (..).
    117 ')
    114 “”),
    114 (//).
    114 (+,%
    109 ()"
    106 (+,%);
    104 )”.
    104 "()
    103 (!)
    100 (+%)
     99 .):
     99 (""
     98 (/):
     94 )!
     93 (“”,
     92 ..);
     91 '),
     90 (//
     89 ....),
     89 (?)
     89 ($
     87 ...),
     83 :)
     83 ../).
     83 (...).
     82 (↑%)
     81 ("",
     78 ");
     77 ./):
     77 (.);
     76 (@.)
     76 (....)
     75 (//):
     74 ;)
     74 (&)
     73 ."(
     73 (+%
     73 ()!
     72 ../),
     72 (+)
     70 ()”.
     68 ()/
     67 //):
     66 …)
     66 (“”
     65 +)
     64 //)".
     64 (./
     64 (+%).
     63 (“”),
     62 ../)
     62 ()”
     61 .(...)
     58 (’
     58 (:)
     57 (“”).
     57 (,%,
     56 (...),
     56 (+
     55 ,);
     55 (+,
     55 ())
     54 //)"
     54 /)."
     54 )”
     54 (+%),
     53 (%,
     52 //)."
     52 .()
     52 %);
     51 :).
     51 (&),
     50 (–)
     49 (....),
     48 //.)"
     48 (↓%)
     48 (@),
     48 ').
     47 )/
     47 (....
     47 (..,
     47 ()...
     47 (')
     47 (!).
     46 (",
     45 (....).
     44 /"),
     44 .,);
     43 ."()
     43 )...
     41 (...)".
     41 (...
     40 ,%);
     40 "(...)
     39 …).
     38 ).”
     38 ()()
     38 '')
     37 ./"),
     37 ))
     36 @)
     36 (@).
     35 /)".
     35 ./)".
     35 ..)."
     34 )*
     34 (::)
     34 (+).
     34 ('),
     33 ’),
     33 :),
     33 .():
     33 ,/)
     33 ('')
     33 ''),
     33 $,)
     32 ”);
     32 (...)"
     32 "");
     31 §).
     31 ..)"
     31 (:),
     31 (/.
     30 ()),
     30 (&).
     30 (!),
     29 )";
     29 (°
     29 (?
     28 ..):
     28 .)"
     28 ($,
     28 (#)
     28 '').
     27 .)".
     27 +).
     27 (/.)
     26 ’)
     26 ./.).
     26 (/.,
     26 (.:
     26 (./,
     26 ($)
     26 (!
     25 ?);
     25 +),
     25 ). 
     25 (?).
     24 )..
     24 (“,
     24 (+%);
     24 ('/)
     24 ("");
     23 :(...)
     23 .)."
     23 )…
     23 (@__)
     23 ()).
     22 °).
     22 °)
     22 (↓,%
     22 (↑,%
     22 ().:
     22 '/)
     21 §),
     21 :(
     21 ..)".
     21 .(/):
     21 .").
     21 .")
     21 (=
     21 (+),
     21 (#).
     21 ".(
     21 "(.
     20 °),
     20 §)
     20 /)"
     20 /")
     20 ./)."
     20 ..,),
     20 (§
     20 (;)
     20 ("/"),
     19 ”):
     19 /@)
     19 .(,
     19 (×
     19 (°)
     19 '/),
     19 &)
     18 ;()
     18 ....);
     18 ."),
     18 ,/),
     18 +,%)
     18 (//,
     18 ($.,).
     18 (#),
     17 ’).
     17 (§).
     17 (.):
     17 ().”
     17 ().#
     17 (''),
     17 (%):
     16 …),
     16 “()
     16 ;():.
     16 ://)
     16 ///)
     16 /.).
     16 .(/)
     16 )'
     16 (:).
     16 ((
     16 ($,).
     15 ”.)
     15 §);
     15 ,/).
     15 ,()
     15 ).*
     15 (“”);
     15 ().,
     15 ("/")
     14 “”);
     14 ?"),
     14 :()
     14 /").
     14 .”)
     14 ./.),
     14 ./.)
     14 ..,).
     14 ..,)
     14 )","":"(,
     14 (_)
     14 (?),
     14 (;).
     14 (...)."
     14 (+,%,
     14 (+);
     14 '/);
     14 &),
     14 $)
     14 "(,
     13 “(
     13 /.),
     13 ./)"
     13 .(;
     13 .(...)"
     13 (./.,
     13 (,,
     13 (''
     13 (&
     13 ($.
     13 (""):
     13 ".)
     13 "):
     13 !”).
     12 ])
     12 [])
     12 ?”).
     12 ,):
     12 ).•
     12 )'.
     12 (’)
     12 (§,
     12 (§)
     12 (@):
     12 (...,
     12 (.,)
     12 (*
     12 (').
     12 (&,
     12 ($.,
     12 ($),
     11 .../).
     11 .(...).
     11 .(.)
     11 .(.
     11 +/)
     11 )•
     11 ).,
     11 (“:
     11 (‘
     11 (°),
     11 (~
     11 (@_),
     11 (://..//)
     11 (.,%
     11 (*)
     11 ()',
     11 &).
     11 %):
     11 $,).
     11 #)
     11 !):
     10 ²)
     10 _)
     10 ?!),
     10 ?!)
     10 :);
     10 ./")
     10 ..../).
     10 ...);
     10 ).".
     10 )).
     10 )),
     10 )#
     10 (…)
     10 (×)
     10 (§),
     10 (@..)
     10 (/;
     10 (/:
     10 (///)
     10 (./):
     10 (.../)
     10 (.)"
     10 ()#
     10 (()
     10 ($.,)
     10 (")
     10 ("'
     10 (""),(//////.)
     10 $).
     10 !);
      9 °);
      9 @).
      9 //.).
      9 ..,);
      9 .().
      9 (…),
      9 (³/),
      9 (=)
      9 (,")
      9 (+/)
      9 ().“
      9 ()..
      9 ($.)
      9 ($,),
      9 (":
      9 (!,
      9 $.,).
      9 !”)
      8 _/)
      8 ]);
      8 []).
      8 []),
      8 @_)
      8 ?").
      8 ?")
      8 ://./)
      8 //.);
      8 //)..
      8 /+)
      8 /()
      8 .;)
      8 .//)
      8 ./)",
      8 .....).
      8 ...").
      8 ."(,
      8 ).“
      8 ).:
      8 (−,%)
      8 (↑%
      8 (@.).
      8 (;),

@fabiolimace
Copy link
Author

Another usage: list only the previous patterns which start with a closing bracket:

cat *.mixed.tokens.txt | grep -E --color=auto '[()]' | awk '{print $2}' | sed 's/[[:alnum:]-]//g' | grep '^).*$' | sort | uniq -c | sort | tac | awk '$1 >= 10'
  50775 )
  34230 ),
  31194 ).
   5469 );
   2506 ):
    676 )",
    673 )".
    604 )."
    513 )?
    317 )"
    185 )”,
    104 )”.
     94 )!
     54 )”
     47 )/
     43 )...
     38 ).”
     37 ))
     34 )*
     29 )";
     25 ). 
     24 )..
     23 )…
     16 )'
     15 ).*
     14 )","":"(,
     12 ).•
     12 )'.
     11 )•
     11 ).,
     10 ).".
     10 )).
     10 )),
     10 )#

@fabiolimace
Copy link
Author

Another usage: the opposite, i.e. list only the previous patterns which end with an opening bracket:

cat *.mixed.tokens.txt | grep -E --color=auto '[()]' | awk '{print $2}' | sed 's/[[:alnum:]-]//g' | grep '^.*($' | sort | uniq -c | sort | tac | awk '$1 >= 5'
  47512 (
    186 .(
    134 "(
     73 ."(
     21 :(
     21 ".(
     16 ((
     13 “(
      7 [(
      7 ,(
      6 ?(
      6 ;(
      6 ./(
      6 .,(
      6 ).(
      5 ”(
      5 /(
      5 ..(
      5 %(

@fabiolimace
Copy link
Author

Another usage: replace every run of alphanumeric characters (and hyphens) with a single 'a'.

cat *.mixed.tokens.txt | awk '{print $2}' | sed -E 's/[[:alnum:]-]+/a/g' | sort | uniq -c | sort | tac | head -n 100
 507817 a,
 353375 a.
  99448 a:
  95085 a
  59447 a;
  57525 "a
  50199 a)
  48918 a.a
  48288 a?
  47675 a,a
  47599 a",
  47215 (a
  39301 “a
  35448 a/a
  35136 a".
  34452 (a)
  33975 a),
  33481 a”,
  32763 a"
  30400 a).
  28052 "a"
  25635 “a”
  24726 (a),
  24025 a”
  22246 a!
  21300 a”.
  18659 a."
  18228 a,a%
  16376 a/a,
  14942 (a).
  14562 a.a,
  14464 'a
  14233 a.a.a/a
  13697 a,a.
  12917 a'
  10958 (a,
  10103 "a",
   9895 a.a/a,
   9190 “a”,
   8910 a/a.
   8888 'a'
   8579 a.a/a
   8159 a/a/a
   8074 a...
   7795 a.a.
   7649 a,a%.
   7601 a.a.a
   7468 a/a/a,
   7284 a,a,
   7014 a/a)
   6943 a,a%,
   6894 "a".
   6636 a.a,a
   6537 “a”.
   6373 a.a.a.a.a
   6312 #a
   6258 a…
   5909 a.”
   5890 a',
   5592 ‘a
   5402 a);
   5027 a/a/a.
   4643 "a,
   4519 a.a.a,
   4273 ‘a’
   4234 a’
   3766 a]
   3611 a.a.a.
   3521 a'a
   3325 a.a/a.
   3157 a:a
   2975 (a/a)
   2916 @a
   2880 [a
   2843 (a);
   2795 a.a;
   2722 a://a.a/a
   2663 a.a.a.a.a,
   2574 (a,a%)
   2537 a%
   2476 a'.
   2428 a):
   2343 (a):
   2282 a.a,a.
   2232 a,a%;
   2189 a"a.
   2176 a/a/a)
   2161 a/a/a).
   2081 (a,a%),
   2024 a’a
   1978 “a,
   1975 *a
   1956 [a]
   1937 a/a).
   1896 a/a/a;
   1892 a’,
   1888 a.a)
   1888 'a',
   1792 (@a)
   1788 a.a).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment