# text parsers for bash / gpt-2 etc. training data
# gist by @FlyingFathead -- last active May 4, 2023
#!/bin/bash
# idiotproofing, "lol"
echo "bad human/entity! this isn't intended to be run!"
tail --lines=+8 ./parsers.sh
exit 1
# ===============================================================================
# 1. SPLIT A LARGE FILE WITH GPT-2'S <|endoftext|> DELIMITER AS THE SPLIT POINT.
# ===============================================================================
# splitting a large text file into separate files using gpt-2's own
# designated <|endoftext|> delimiters as the split points.
# Escaped pipes like \| inside RS make awk warn on some systems; a bracket
# expression [|] matches a literal pipe without the warning.
# Try it out with a copy of your data set in a separate folder just to be sure.
# Instructions: replace 'yourtextfile.txt' with your text file.
# The pattern inside RS='...' can be changed to your own delimiter if necessary.
# For help on that, see the test tool at: https://regexr.com/
# Run (GNU awk treats a multi-character RS as a regex):
awk '{f="file" NR; print $0 > f}' RS='<[|]endoftext[|]>' yourtextfile.txt
# NOTE:
# If you end up with e.g. a lot of empty files, use this command:
find . -size 0 -print -delete
# ... to DELETE all zero-byte files in the current directory.
# the '-print' flag will print out the files that were found and deleted.
# to also get rid of short residual snippets, use a size threshold instead,
# e.g. 'find . -size -4k -print -delete' (note: a bare number in find's
# '-size' counts 512-byte blocks; use the 'k' suffix for kilobytes).
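# As a sanity check, the split can be rehearsed on a throwaway file first.
# The directory and file names below are made up for the demo; GNU awk is
# assumed, since a multi-character RS is treated as a regex there.

```shell
# scratch-directory demo (hypothetical paths); two delimiters should
# yield three output segments: file1, file2, file3.
mkdir -p /tmp/eot_split_demo && cd /tmp/eot_split_demo
printf 'first part<|endoftext|>second part<|endoftext|>third part' > yourtextfile.txt
# bracket expressions make the pipes literal without escape warnings
awk '{f="file" NR; print $0 > f}' RS='<[|]endoftext[|]>' yourtextfile.txt
ls file*
```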
# ======================================================================================
# 2. JOIN FILES BACK INTO A SINGLE FILE AND ADD THE <|endoftext|> BETWEEN EACH SEGMENT.
# ======================================================================================
# putting things back together; this combines all the .txt files in
# a directory, and inserts a line break, a <|endoftext|> delimiter
# and another line break between each merged text file's start / end.
# (skip the output file itself, so a re-run doesn't merge it back in)
#!/bin/bash
for file in *.txt; do
  [ "$file" = "Combined_text.txt" ] && continue
  cat "$file" >> Combined_text.txt
  printf '\n\n<|endoftext|>\n\n' >> Combined_text.txt
done
# ==================================================================================
# 3. JOIN LINE BREAKS WITHIN PASSAGES OF TEXT BUT LEAVE THE BLANK LINES UNAFFECTED.
# ==================================================================================
# (In other words, maintain the structural integrity of the text in large plaintext datasets.)
# From: https://stackoverflow.com/questions/39734125/sed-to-combine-n-text-lines-separated-by-blank-lines
# (quote) This might work for you (GNU sed):
sed '/./{:a;N;s/\n\(.\)/ \1/;ta}' file
# The command line above prints the processed text to stdout.
# (quote) If the line is not empty, read the following line; if that is
# not empty, replace the newline with a space and repeat, otherwise print
# the pattern space. If the line was empty in the first place, print the
# empty line. This caters for an empty first line; if that is not a concern
# and there is only one empty line between non-blank lines, this also works:
sed ':a;N;s/\n\(.\)/ \1/;ta' file
# If you want sed to replace the lines in an existing file
# (NOTE, your file WILL get overwritten -- use a copy!), then:
sed -i '/./{:a;N;s/\n\(.\)/ \1/;ta}' file
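# A small made-up sample shows the effect: two wrapped lines join into one,
# and the blank line separating the paragraphs survives (GNU sed assumed).

```shell
printf 'aaa\nbbb\n\nccc\nddd\n' > /tmp/wrapped_demo.txt
sed '/./{:a;N;s/\n\(.\)/ \1/;ta}' /tmp/wrapped_demo.txt
# aaa bbb
#
# ccc ddd
```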
# to do this i.e. in Notepad++ with RegEx:
# CTRL-H => find and replace => find field: `(?<=.)\R(?=\S)`
# put a single space (` `) in the "replace with" field
# make sure the "regular expression" search mode is selected.
#================================================
# Delete all lines between two patterns (cleanup)
#================================================
# Easiest method is to use sed:
sed '/FROM_HERE/,/TO_HERE/d' # deletes line matching 'FROM_HERE' to the line matching 'TO_HERE'. Great for margin cleanups.
sed '/FROM_HERE/,/TO_HERE/{//p;d;}' # deletes the lines between two matches _but_ keeps the second matching line.
# another option ... :
sed 's/^.*\(consectetuer.*elit\).*$/\1/' file
# Decoding the sed s/find/replace/ syntax:
# s/^.*              -- substitute, starting at the beginning of the line (^)
#                       followed by anything (.*) up to...
# \(                 -- start a capture group
# consectetuer.*elit -- match the first word, everything (.*) up to the
#                       last word you want to keep
# \)                 -- end the capture group
# .*$                -- match everything else (.*) to the end of the line ($)
# /                  -- end the find section of the substitute
# \1                 -- replace with the captured text between \( and \)
# /                  -- end the replace
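# A throwaway example of the inclusive range delete above (the marker
# names are just placeholders):

```shell
printf 'keep1\nFROM_HERE\njunk\nTO_HERE\nkeep2\n' > /tmp/rangecut_demo.txt
sed '/FROM_HERE/,/TO_HERE/d' /tmp/rangecut_demo.txt
# keep1
# keep2
```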
# ========================================
# Removing end-of-line dashes/hyphenation
# ========================================
# Depending on how you want blank lines handled, this works in awk's
# paragraph mode (RS= reads blank-line-separated records):
awk -v RS= '{gsub(/-\n/,""); $1=$1} 1' file
# [More @ https://unix.stackexchange.com/questions/647648/bash-remove-dashes-and-new-lines-before-replacing-new-lines-with-spaces ]
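# A minimal made-up check: the trailing dash and the line break after it
# disappear, joining the two halves of the word.

```shell
printf 'hyphen-\nated word\n' > /tmp/dehyph_demo.txt
awk -v RS= '{gsub(/-\n/,""); $1=$1} 1' /tmp/dehyph_demo.txt
# hyphenated word
```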
# ===========================================================
# 4. SPLIT MASSIVE TEXT FILES BY SIZE INTO SMALLER SEGMENTS.
# ===========================================================
split -C 100m --numeric-suffixes mylongtext.txt myshortertexts-100mb-
# The '-C 100m' caps each segment at 100 MB without splitting mid-line,
# '--numeric-suffixes' numbers each split segment, and the last argument
# is the filename prefix for the output files.
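# A scaled-down sketch of the same thing, with a hypothetical 20-byte chunk
# size so it can be eyeballed; concatenating the parts reproduces the
# original file.

```shell
mkdir -p /tmp/split_size_demo && cd /tmp/split_size_demo
seq 1 20 > long.txt
split -C 20 --numeric-suffixes long.txt part-
ls part-*
# verify the round trip
cat part-* | cmp - long.txt && echo "round-trip OK"
```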
# ==============================
# insert delimiter every x lines
# ==============================
sed '0~30 s/$/\n<|endoftext|>\n/' < "$inputfile" > "$outputfile"
# Every 30 lines a "<|endoftext|>" line plus a blank line is appended.
# (the '0~30' first~step address is a GNU sed extension; set $inputfile
# and $outputfile to your own filenames.)
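# The same idea at a smaller, made-up interval of 2 lines, so the result is
# easy to inspect (GNU sed's first~step addressing assumed):

```shell
seq 1 4 > /tmp/every_n_demo.txt
sed '0~2 s/$/\n<|endoftext|>/' /tmp/every_n_demo.txt
# 1
# 2
# <|endoftext|>
# 3
# 4
# <|endoftext|>
```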
# ===============================================================
# A quick script to rename all files in the directory to '*.txt'
# ===============================================================
function alltotxt() {
echo "[WARN] We are about to do this in the current directory:"
for f in *; do echo mv -- "$f" "${f%.*}.txt"; done
read -p "Are you sure? " -n 1 -r
echo ""
if [[ ! $REPLY =~ ^[Yy]$ ]]
then
echo "Aborted."
return 1
fi
for f in *; do mv -- "$f" "${f%.*}.txt"; done
echo "Done."
}
# =====================================
# 5. ADDITIONAL SOURCE TEXT PROCESSING.
# =====================================
# remove urls from text files; note: use 'sed -i' to replace in-file.
sed -e 's!http[s]\?://\S*!!g' file
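# A quick made-up check of the URL stripper ('\S' is a GNU sed extension;
# note that the URL's surrounding spaces are left behind):

```shell
printf 'docs at https://example.com/page and more text\n' | sed -e 's!http[s]\?://\S*!!g'
# docs at  and more text
```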
# change all extensions to .txt in the current working directory
for f in *; do mv -- "$f" "${f%.*}.txt"; done
# for cleaning up, remove numbers inside brackets, i.e. [652], [46], [3]
# and such; anything that resembles a footnote reference.
sed -i 's/\[[0-9][0-9][0-9]\]//g' textfile
sed -i 's/\[[0-9][0-9]\]//g' textfile
sed -i 's/\[[0-9]\]//g' textfile
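# The three passes above can also be collapsed into a single expression
# with GNU sed's \+ quantifier; a quick made-up check:

```shell
printf 'As noted[3] and shown[46] before[652].\n' | sed 's/\[[0-9]\+\]//g'
# As noted and shown before.
```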
# =========================================================================
# Substitute patterns that exist between line number A and B in a text file
# =========================================================================
# substitute a pattern between two line numbers in a text file, i.e. in
# this case between lines 155757 and 301566:
# '|mydelimiter|' => '> '
sed '155757,301566s/[|]mydelimiter[|]/> /g' "$file"
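# The same address form with small, made-up line numbers: only lines 2-3
# are touched, line 1 passes through unchanged.

```shell
printf 'one |mydelimiter| a\ntwo |mydelimiter| b\nthree |mydelimiter| c\n' > /tmp/linerange_demo.txt
sed '2,3s/[|]mydelimiter[|]/> /g' /tmp/linerange_demo.txt
```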
# ===================================================
# Parsing multiple line breaks in Notepad++ into one.
# ===================================================
# Notepad++ Multiple line breaks (CR)
# You can use a regular expression S&R:
#   (?:\r\n){2,}
# or
#   \R{2,}
# in the "Find what" field and
#   \r\n
# in the "Replace with" field. You may adjust the replacement pattern as per your needs.
# =========================================
# Removing UTF-8/Unicode soft hyphenation.
# =========================================
# Soft hyphenation often adds to broken tokens and other text output issues in GPT-2.
# It is very common to have soft hyphens embedded inside UTF-8 datasets without noticing.
# My personal approach is to remove soft hyphenation and keep other special characters
# at a minimum to avoid broken tokens.
# In UTF-8 the soft hyphen (U+00AD) is encoded as the two bytes 0xC2 0xAD;
# match both bytes, or a stray 0xC2 gets left behind:
sed -i 's/\xc2\xad//g' filename
# NOTE that this might still leave residuals in mixed encodings; it might not be
# the best preferred method; see threads on soft hyphenations and raku/perl6.
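# A quick byte-level check (GNU sed's \xNN escapes assumed): the word below
# carries a soft hyphen between its halves, invisible in most terminals.

```shell
printf 'co\xc2\xadoperate\n' > /tmp/shy_demo.txt
sed 's/\xc2\xad//g' /tmp/shy_demo.txt
# cooperate
```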
# =================================
# wikidata / wiki corpus cleansing:
# =================================
# strip all image thumbnail related entries
# these are usually left scattered around even after
# the set has been run through commonly used wikimedia parsers
# (the problem seems to occur in certain Wikipedia/-media languages)
# remove wikimedia thumbnail pixel-size leftover markings (e.g. '250px'):
sed -i 's/[0-9]\+px//g' wikipedia-file
# ====================================================
# Additional parsing; for sample data; logs and such.
# ====================================================
# Get the latest occurrence of the delimiter and print out ONLY the text after it.
# (the -F field separator is a regex, so bracket the pipes to keep them literal)
awk -F '<[|]endoftext[|]>' '{print $NF}' <<< "$(tail textfile.txt)"
# snatch text between two '<|endoftext|>' delimiters into a string variable.
function between() {
thetextfile=mytext.txt
latestfield=$(cat "$thetextfile")
latestfield=${latestfield#*\<\|endoftext\|\>} # remove everything up to and including the first <|endoftext|>
latestfield=${latestfield%%\<\|endoftext\|\>*} # remove everything from the next <|endoftext|> onward
echo "$latestfield"
}
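# The same parameter expansions can be tried inline on a throwaway string,
# no file needed:

```shell
s='head<|endoftext|>middle<|endoftext|>tail'
s=${s#*\<\|endoftext\|\>}    # strip through the first delimiter
s=${s%%\<\|endoftext\|\>*}   # strip from the next delimiter onward
echo "$s"
# middle
```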
# get the latest-between '<|endoftext|>' -delimiters. rinse and repeat.
# for live feed.
tail -f "$latest" | while read -r b; do echo -n "${b//\<\|endoftext\|\>/}"; done