Last active
May 4, 2023 08:40
-
-
Save FlyingFathead/4621d00c4317bafb693f212a25959c9e to your computer and use it in GitHub Desktop.
Text parsers (bash one-liners) for cleaning and preparing GPT-2 etc. training data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Guard: this file is a snippet collection / cheat sheet, NOT a runnable
# script. If someone executes it anyway, warn them, dump the actual
# content for reading, and bail out before any destructive command runs.
# idiotproofing, "lol"
echo "bad human/entity! this isn't intended to be run!" >&2
# Show everything after the guard header. "$0" is used instead of a
# hard-coded ./parsers.sh so this works regardless of the file's name
# or the caller's working directory. (NOTE(review): the +8 offset is
# tied to the original layout — adjust if the header length changes.)
tail --lines=+8 "$0"
exit 1
# ===============================================================================
# 1. SPLIT A LARGE FILE WITH GPT-2'S <|endoftext|> DELIMITER AS THE SPLIT POINT.
# ===============================================================================
# splitting a large text file into separate files using gpt-2's own
# designated <|endoftext|> -delimiters as the delimiter for file splits.
# The record separator is a regex, so '|' must be neutralized: the original
# RS="\<\|endoftext\|\>" made gawk warn and silently turned '\|' into the
# regex alternation '|', splitting in the wrong places. Bracket expressions
# ([|]) match the literal pipe portably and warning-free.
# Try it out with a copy of your data set in a separate folder just to be sure.
# Instructions: replace 'yourtextfile.txt' with your text file.
# The pattern inside RS="..." can be changed to your own delimiter if necessary.
# For help on that, see the test tool at: https://regexr.com/
# Run:
# close(f) releases each output file as soon as it is written, so huge
# datasets don't exhaust the open-file-descriptor limit.
awk '{f="file" NR; print $0 > f; close(f)}' RS="<[|]endoftext[|]>" yourtextfile.txt
# NOTE:
# If you end up with i.e. a lot of empty files, use this command:
# (runs immediately in the CURRENT directory — cd to your data dir first!)
find . -size 0 -print -delete
# ... to DELETE all files in the current directory with the size of 0kb.
# the '-print' flag will print out the files that were found and deleted.
# you can also switch the '0' value to higher values (in kb) if you want to
# get rid of possible short snippets, accidental residual snippets, etc.
# ======================================================================================
# 2. JOIN FILES BACK INTO A SINGLE FILE AND ADD THE <|endoftext|> BETWEEN EACH SEGMENT.
# ======================================================================================
# putting things back together; this combines all the .txt files in
# a directory, and inserts a line break, a <|endoftext|> delimiter
# and another line break between each merged text file's start / end.
for file in *.txt; do
  # Skip the output file itself: once Combined_text.txt exists it matches
  # *.txt, and the original loop would concatenate it into itself on a
  # second run (or even mid-run, depending on glob expansion timing).
  [ "$file" = "Combined_text.txt" ] && continue
  cat "$file" >> Combined_text.txt
  printf '\n\n<|endoftext|>\n\n' >> Combined_text.txt
done
# ==================================================================================
# 3. JOIN LINE BREAKS WITHIN PASSAGES OF TEXT BUT LEAVE THE BLANK LINES UNAFFECTED.
# ==================================================================================
# (In other words, maintain the structural integrity of the text in large plaintext datasets.)
# From: https://stackoverflow.com/questions/39734125/sed-to-combine-n-text-lines-separated-by-blank-lines
# (quote) This might work for you (GNU sed):
sed '/./{:a;N;s/\n\(.\)/ \1/;ta}' file
# The command line above should give you the output of the text after processing.
# (quote) If the line is not empty read the following line and if that is not empty,
# replace the newline by a space and repeat, otherwise print the pattern space.
# If the line was empty in the first place print the empty line:
# this caters for an empty first line, if this is not the case and there is only
# one empty line between non-blank lines:
sed ':a;N;s/\n\(.\)/ \1/;ta' file
# If you want sed to replace the lines in an existing file
# (NOTE, your file WILL get overwritten -- use a copy!), then:
sed -i '/./{:a;N;s/\n\(.\)/ \1/;ta}' file
# The same can be done i.e. in Notepad++ with RegEx:
# CTRL-H => find and replace => find field: `(?<=.)\R(?=\S)`
# put one single space (` `) in the "replace with" field
# make sure "regular expression" method is selected.
#================================================
# Delete all lines between two patterns (cleanup)
#================================================
# Easiest method is to use sed (a file operand is required — without it
# sed silently waits on stdin):
sed '/FROM_HERE/,/TO_HERE/d' file # deletes line matching 'FROM_HERE' to the line matching 'TO_HERE'. Great for margin cleanups.
sed '/FROM_HERE/,/TO_HERE/{//p;d;}' file # deletes the lines between two matches _but_ keeps the second matching line.
# another option ... :
sed 's/^.*\(consectetuer.*elit\).*$/\1/' file
# Decoded the sed s/find/replace/ syntax:
#   s/^.* -- substitute starting at the beginning of the line (^) followed by anything (.*) up to...
#   \( - start a capture group
#   consectetuer.*elit\. - match the first word, everything (.*) up to the last word (in this case, including the trailing (escaped) dot) you want to match
#   \) - end the capture group
#   match everything else (.*) to the end of the line ($)
#   / - end the substitute find section
#   \1 - replace with the captured group between the \( and the \) above
#   / - end the replace
# ========================================
# Removing end-of-line dashes/hyphenation
# ========================================
# Paragraph mode (-v RS=) reads blank-line-separated blocks as single
# records; gsub(/-\n/,"") re-joins words hyphenated across a line break,
# and $1=$1 forces the record to be rebuilt with spaces, so each
# paragraph comes out as one line.
# Depending on how you'd want blank lines handled, this:
awk -v RS= '{gsub(/-\n/,""); $1=$1} 1' file
# [More @ https://unix.stackexchange.com/questions/647648/bash-remove-dashes-and-new-lines-before-replacing-new-lines-with-spaces ]
# ===========================================================
# 4. SPLIT MASSIVE TEXT FILES BY SIZE INTO SMALLER SEGMENTS.
# ===========================================================
#        segment size   large file in   shorter segment prefix
#        v               v               v
split -C 100m --numeric-suffixes mylongtext.txt myshortertexts-100mb-
# The '-C 100m' defines the split size at 100 megs; unlike '-b', -C
# splits on line boundaries so no line is cut in half.
# "--numeric-suffixes" are just that; each split segment will be numbered.
# The last part is how you define the prefix of the files the split will output.
# ==============================
# insert delimiter every x lines
# ==============================
# '0~30' is a GNU sed step address: every 30th line. Set $inputfile and
# $outputfile before running.
sed '0~30 s/$/\n<|endoftext|>\n/g' < "$inputfile" > "$outputfile"
# Every 30 lines "<|endoftext|>" is inserted, followed by a blank line.
# ===============================================================
# A quick script to rename all files in the directory to '*.txt'
# ===============================================================
# Previews every rename, asks for a one-key confirmation, then renames
# each file in the current directory to have a .txt extension.
function alltotxt() {
  local f base
  echo "[WARN] We are about to do this in the current directory:"
  for f in *; do
    # already .txt: renaming a file onto itself would just make mv error
    [[ "$f" == *.txt ]] && continue
    echo mv -- "$f" "${f%.*}.txt"
  done
  read -p "Are you sure? " -n 1 -r
  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
    echo "Aborted."
    # return, not exit: exit would kill the whole shell when sourced
    return 1
  fi
  echo ""
  for f in *; do
    [[ "$f" == *.txt ]] && continue
    # ${f%.*} strips the last extension. (The original used ${f%.\*},
    # which matches a literal '.*' suffix and therefore never stripped
    # anything, producing names like 'foo.dat.txt'.)
    base="${f%.*}"
    # dotfiles like '.bashrc' would strip to "" — keep the full name
    [[ -n "$base" ]] || base="$f"
    mv -- "$f" "${base}.txt"
  done
  echo "Done."
}
# =====================================
# 5. ADDITIONAL SOURCE TEXT PROCESSING.
# =====================================
# remove urls from text files; note: use 'sed -i' to replace in-file.
# (\S is a GNU sed extension: the URL runs until the next whitespace.)
sed -e 's!http[s]\?://\S*!!g' file
# change all extensions to .txt in the current working directory
# (runs immediately! ${f%.*} strips the real extension — the original's
# ${f%.\*} matched a literal '.*' suffix and never stripped anything)
for f in *; do mv -- "$f" "${f%.*}.txt"; done
# for cleaning up, remove numbers inside brackets, i.e. [652], [46], [3]
# and such; anything that resembles a footnote reference.
# A single interval expression \{1,3\} covers 1-3 digit footnotes and
# replaces the original three separate in-place passes in one go.
sed -i 's/\[[0-9]\{1,3\}\]//g' textfile
# =========================================================================
# Substitute patterns that exist between line number A and B in a text file
# =========================================================================
# substitute a pattern between line numbers, i.e. in this case between
# line nr. 155757 and 301566:
# '|mydelimiter|' => '> '
# ([|] matches a literal pipe; the replacement is plain '> ' — the
# original's '\>' was an undefined replacement escape)
sed '155757,301566s/[|]mydelimiter[|]/> /g' "$file"
# ===================================================
# Parsing multiple line breaks in Notepad++ into one.
# ===================================================
# Notepad++ Multiple line breaks (CR)
# You can use a regular expression S&R:
#   (?:\r\n){2,}
# or
#   \R{2,}
# in the Find what field and
#   \r\n
# in the Replace with field. You may adjust the replacement pattern as per your needs.
# (These are Notepad++ regexes, not shell — kept commented so this file
# stays valid bash.)
# =========================================
# Removing UTF-8/Unicode soft hyphenation.
# =========================================
# Soft hyphenation often adds to broken tokens and other text output issues in GPT-2.
# It is very common to have soft hyphenation embedded inside Unicode (UTF-8) datasets without noticing it.
# My personal approach is to remove soft hyphenation and keep other special characters at a minimum to avoid broken tokens.
# U+00AD SOFT HYPHEN encodes as the TWO bytes 0xC2 0xAD in UTF-8; the
# original 's/\xAD//g' removed only the second byte, leaving a stray
# 0xC2 (i.e. new mojibake) behind. Remove both bytes, byte-wise
# (LC_ALL=C) so the match works regardless of the current locale:
LC_ALL=C sed -i 's/\xc2\xad//g' filename
# NOTE that this might still leave residuals; it might not be the best preferred method; see threads on soft hyphenations and raku/perl6.
# =================================
# wikidata / wiki corpus cleansing:
# =================================
# strip all image thumbnail related entries
# these are usually left scattered around even after
# the set has been run through commonly used wikimedia parsers
# (problem seems to occur in certain Wikipedia/-media languages)
# remove wikimedia thumbnail picture pixel-size leftover markings:
# \{2,4\} covers the common 80px...1200px range — the original matched
# exactly three digits and missed e.g. '80px' and '1024px'.
sed -i 's/[0-9]\{2,4\}px//g' wikipedia-file
# ====================================================
# Additional parsing; for sample data; logs and such.
# ====================================================
# Get the latest occurrence of a string and print out ONLY the text after that.
# NOTE: -F takes an ERE, so the pipes must be neutralized with [|]: the
# original -F '<|endoftext|>' was parsed as the alternation
# '<' OR 'endoftext' OR '>' and split on every one of those characters.
awk -F '<[|]endoftext[|]>' '{print $NF}' <<< "$(tail textfile.txt)"
# snatch text between two '<|endoftext|>' delimiters into a string variable.
function between() {
  # locals so we don't clobber the caller's variables (the original
  # leaked both names into the calling shell)
  local thetextfile="mytext.txt"
  local latestfield
  latestfield=$(cat "$thetextfile") || return 1
  # remove everything up to and including the FIRST <|endoftext|>
  latestfield=${latestfield#*\<\|endoftext\|\>}
  # remove everything from the NEXT <|endoftext|> onward
  latestfield=${latestfield%%\<\|endoftext\|\>*}
  printf '%s\n' "$latestfield"
}
# get the latest-between '<|endoftext|>' -delimiters. rinse and repeat.
# for live feed: follow the file named by "$latest" and echo its content
# with the delimiters stripped. Runs until killed (tail -f).
# IFS= read -r preserves whitespace/backslashes; printf '%s' with the
# expansion QUOTED avoids the word-splitting/globbing of the original
# unquoted 'echo -n ${b//...}'.
tail -f "$latest" | while IFS= read -r b; do printf '%s' "${b//\<\|endoftext\|\>/}"; done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment