conceptofmind/c4_filters.py

## c4_filters.py
import re
import nltk
import ftfy
import multiprocessing
from datasets import load_dataset
from langdetect import detect_langs
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("punkt")

whitespace={
    " ",
    " ",
    " ",
    " ",
    " ",
    "　",
    " ",
    " ",
    " ",
    " ",
    "",
    "",
}

def is_not_empty(example):
    return len(example['text']) > 0

def is_terminal_punctuation(line):
    return bool(re.search(r'[\.\?!"]\s*$', line))

def is_valid_sentence(sentence):
    words = word_tokenize(sentence)
    return len(words) >= 3

def contains_javascript(sentence):
    return bool(re.search(r'\b(?:java\s*script|JS)\b', sentence, re.IGNORECASE))

def contains_lorem_ipsum(sentence):
    return bool(re.search(r'\b(?:lorem\s*ipsum)\b', sentence, re.IGNORECASE))

def contains_curly_bracket(sentence):
    return bool(re.search(r'[{}]', sentence))

def is_english(sentence):
    languages = detect_langs(sentence)
    return any(lang.lang == 'en' and lang.prob >= 0.99 for lang in languages)

# def contains_warning(sentence):
#     return bool(re.search(r'WARNING:\s*THE\s*EDGAR\s*SYSTEM\s*ENCOUNTERED\s*ERROR\(S\)\s*WHILE\s*PROCESSING\s*THIS\s*SCHEDULE\.', sentence, re.IGNORECASE))

def contains_url(sentence):
    url_pattern = r'(?:(?:http|https|ftp):\/\/|www\.)[\w/\-?=%.]+\.[\w/\-?=%.]+'
    return bool(re.search(url_pattern, sentence))

def contains_phone_number(sentence):
    phone_number_pattern = r'\b(?:\+\d{1,3})?[-. (]*(?:\d{1,3})?[-. )]*(?:\d{2,5})[-. (]*(?:\d{2,5})[-. )]*(?:\d{2,5})\b'
    return bool(re.search(phone_number_pattern, sentence))

def remove_ssn(sentence):
    return re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '', sentence)

def remove_ip_addresses(sentence):
    return re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '', sentence)

def remove_credit_card_numbers(sentence):
    return re.sub(r'\b(?:\d{4}[ -]?){3}\d{4}\b', '', sentence)

# def has_min_alphanumeric_percentage(sentence, min_percentage=50):
#     alphanumeric_count = sum(c.isalnum() for c in sentence)
#     total_count = len(sentence)
#     percentage = (alphanumeric_count / total_count) * 100
#     return percentage >= min_percentage

def remove_repeated_chars(sentence):
    return re.sub(r'(.)\1{3,}', r'\1', sentence)

def fix_encoding(text):
    return ftfy.fix_text(text)

def normalize_whitespace(text):
    text = "".join([char if char not in whitespace else " " for char in text])
    return re.sub(r'\s+', ' ', text).strip()

def has_min_chars(text, min_chars=3):
    return len(text) >= min_chars

def filter_dataset(example):
    text = example["text"]
    text = fix_encoding(text)

    # Check for lorem ipsum and curly brackets in the initial text
    if contains_lorem_ipsum(text) or contains_curly_bracket(text) or not is_english(text):
        return {"text": ""}

    sentences = sent_tokenize(text)

    valid_sentences = []

    for sentence in sentences:
        if (is_terminal_punctuation(sentence) and
            is_valid_sentence(sentence) and
            not contains_javascript(sentence) and
            #not contains_warning(sentence) and
            not contains_phone_number(sentence) and
            not contains_url(sentence) and
            #has_min_alphanumeric_percentage(sentence) and
            has_min_chars(sentence)):
            sentence = remove_ssn(sentence)
            sentence = remove_repeated_chars(sentence)
            sentence = remove_ip_addresses(sentence)
            sentence = remove_credit_card_numbers(sentence)
            sentence = normalize_whitespace(sentence)
            valid_sentences.append(sentence)

    if len(valid_sentences) < 5:
        return {"text": ""}
    else:
        return {"text": "\n".join(valid_sentences)}

if __name__ == "__main__":
    dataset = load_dataset("conceptofmind/test_c4_filters", split="train")

    print(dataset)

    filtered_dataset = dataset.map(
        filter_dataset,
        num_proc=multiprocessing.cpu_count()
    ).filter(is_not_empty)

    print(filtered_dataset)

    filtered_dataset.push_to_hub('uber_clean')

## unit_test.py
import unittest
from datasets import load_dataset
from uber_clean import (
    contains_phone_number,
    contains_javascript,
    contains_lorem_ipsum,
    contains_curly_bracket,
    is_terminal_punctuation,
    is_valid_sentence,
    has_min_chars,
    fix_encoding,
    normalize_whitespace,
    remove_repeated_chars,
    has_min_alphanumeric_percentage,
    is_english,
    remove_ssn,
    remove_credit_card_numbers,
    contains_url,
    remove_ip_addresses,
    filter_dataset
)


class TestFilters(unittest.TestCase):

    def test_contains_lorem_ipsum(self):
        test_cases = [
            ("This is a sentence with lorem ipsum text.", True),
            ("Lorem ipsum dolor sit amet, consectetur adipiscing elit.", True),
            ("This is a normal sentence without it.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(contains_lorem_ipsum(sentence), expected_result)

    def test_contains_curly_bracket(self):
        test_cases = [
            ("This is a sentence with a { curly bracket.", True),
            ("The code snippet is: function() { return true; }", True),
            ("This is a normal sentence without any curly brackets.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(contains_curly_bracket(sentence), expected_result)

    def test_is_terminal_punctuation(self):
        test_cases = [
            ("This sentence ends with a period.", True),
            ("What a great day!", True),
            ("Is this a question?", True),
            ("This sentence does not have terminal punctuation", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(is_terminal_punctuation(sentence), expected_result)

    def test_is_valid_sentence(self):
        test_cases = [
            ("This sentence has at least three words.", True),
            ("Only three words.", True),
            ("One.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(is_valid_sentence(sentence), expected_result)


    def test_contains_phone_number(self):
        test_cases = [
            ("Call me at (123) 456-7890.", True),
            ("My number is +1 (555) 123-4567.", True),
            ("You can reach me at 9876543210.", True),
            ("The temperature is 100F today.", False),
            ("This is a regular sentence without a phone number.", False),
            ("The number you are trying to reach is no longer in service.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(contains_phone_number(sentence), expected_result)


    def test_contains_javascript(self):
        test_cases = [
            ("This page requires JavaScript to run properly.", True),
            ("Please enable Javascript in your browser.", True),
            ("This is a sentence with the word javascript.", True),
            ("This page uses CSS for styling.", False),
            ("This is a regular sentence without the word.", False),
            ("Java and Python are popular programming languages.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(contains_javascript(sentence), expected_result)


    def test_has_min_chars(self):
        test_cases = [
            ("This sentence has more than 20 characters.", True),
            ("This sentence has less.", True)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(has_min_chars(sentence, 20), expected_result)

    def test_fix_encoding(self):
        test_cases = [
            ("This is a normal sentence without any encoding issues.", "This is a normal sentence without any encoding issues."),
            ("This sentence has â€˜smartâ€™ quotes.", "This sentence has 'smart' quotes."),
            ("MÃ¶bius strip is a surface with only one side.", "Möbius strip is a surface with only one side.")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(fix_encoding(input_text), expected_result)

    def test_normalize_whitespace(self):
        test_cases = [
            ("This is a normal sentence without any unusual whitespace.", "This is a normal sentence without any unusual whitespace."),
            ("This sentence has\u2009different\u200aspaces.", "This sentence has different spaces."),
            ("This\u3000sentence\u2002has\u2003wide\u2004spaces.", "This sentence has wide spaces.")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(normalize_whitespace(input_text), expected_result)

    def test_remove_repeated_chars(self):
        test_cases = [
            ("This is a normal sentence without any repeated characters.", "This is a normal sentence without any repeated characters."),
            ("Thiiiiis sentenceeee has soooome repeaaaated chaaaaracters.", "This sentence has some repeated characters."),
            ("AAAAAhhhhhh, I can't believe thisssss!", "Ah, I can't believe this!")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(remove_repeated_chars(input_text), expected_result)

    def test_has_min_alpha_numeric(self):
        test_cases = [
            ("This is a normal sentence with enough alpha numeric characters.", True),
            ("$%#@!&*^", False),
            ("A sentence with 20% alpha numeric characters.", True),
            ("A12_+%# $()?", False)
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(has_min_alphanumeric_percentage(input_text, min_percentage=75), expected_result)

    def test_is_english(self):
        test_cases = [
            ("This is a normal English sentence.", True),
            ("Ceci est une phrase en français.", False),
            ("Dies ist ein Satz auf Deutsch.", False),
            ("Esta es una oración en español.", False)
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(is_english(input_text), expected_result)

    def test_remove_ssn(self):
        test_cases = [
            ("This is a normal sentence without any social security numbers.", "This is a normal sentence without any social security numbers."),
            ("My social security number is 123-45-6789.", "My social security number is ."),
            ("Another SSN is 987-65-4321, please handle it.", "Another SSN is , please handle it.")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(remove_ssn(input_text), expected_result)

    def test_contains_url(self):
        test_cases = [
            ("Visit our website at https://www.example.com.", True),
            ("You can find the article at http://example.org/article.", True),
            ("Check out our blog: www.blog.example.net", True),
            ("My email is john@example.com", False),
            ("The price is $20,000.", False),
            ("This is a regular sentence without a URL.", False)
        ]

        for sentence, expected_result in test_cases:
            self.assertEqual(contains_url(sentence), expected_result)

    def test_remove_credit_card(self):
        test_cases = [
            ("This is a normal sentence without any credit card numbers.", "This is a normal sentence without any credit card numbers."),
            ("My credit card number is 1234-5678-9123-4567.", "My credit card number is ."),
            ("Another credit card number is 9876-5432-1098-7654, please remove it.", "Another credit card number is , please remove it.")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(remove_credit_card_numbers(input_text), expected_result)

    def test_remove_ip(self):
        test_cases = [
            ("This is a normal sentence without any IP addresses.", "This is a normal sentence without any IP addresses."),
            ("The server IP address is 192.168.1.1.", "The server IP address is ."),
            ("Another IP address is 10.0.0.1, please remove it.", "Another IP address is , please remove it.")
        ]

        for input_text, expected_result in test_cases:
            self.assertEqual(remove_ip_addresses(input_text), expected_result)

    def test_filter_dataset(self):
        # Load a sample from the dataset
        dataset = load_dataset("conceptofmind/test_l", split="train")
        example = dataset[0]  # Get the first example from the dataset

        # Apply the filter_dataset function
        filtered_example = filter_dataset(example)

        # Perform assertions to check if the filtered_example is as expected
        # Example: Check if the filtered_example is not empty
        self.assertNotEqual(filtered_example["text"], "")


if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
	import re
	import nltk
	import ftfy
	import multiprocessing
	from datasets import load_dataset
	from langdetect import detect_langs
	from nltk.tokenize import sent_tokenize, word_tokenize

	nltk.download("punkt")

	whitespace={
	" ",
	" ",
	" ",
	" ",
	" ",
	"　",
	" ",
	" ",
	" ",
	" ",
	"",
	"",
	}

	def is_not_empty(example):
	return len(example['text']) > 0

	def is_terminal_punctuation(line):
	return bool(re.search(r'[\.\?!"]\s*$', line))

	def is_valid_sentence(sentence):
	words = word_tokenize(sentence)
	return len(words) >= 3

	def contains_javascript(sentence):
	return bool(re.search(r'\b(?:java\s*script\|JS)\b', sentence, re.IGNORECASE))

	def contains_lorem_ipsum(sentence):
	return bool(re.search(r'\b(?:lorem\s*ipsum)\b', sentence, re.IGNORECASE))

	def contains_curly_bracket(sentence):
	return bool(re.search(r'[{}]', sentence))

	def is_english(sentence):
	languages = detect_langs(sentence)
	return any(lang.lang == 'en' and lang.prob >= 0.99 for lang in languages)

	# def contains_warning(sentence):
	# return bool(re.search(r'WARNING:\sTHE\sEDGAR\sSYSTEM\sENCOUNTERED\sERROR\(S\)\sWHILE\sPROCESSING\sTHIS\s*SCHEDULE\.', sentence, re.IGNORECASE))

	def contains_url(sentence):
	url_pattern = r'(?:(?:http\|https\|ftp):\/\/\|www\.)[\w/\-?=%.]+\.[\w/\-?=%.]+'
	return bool(re.search(url_pattern, sentence))

	def contains_phone_number(sentence):
	phone_number_pattern = r'\b(?:\+\d{1,3})?[-. (](?:\d{1,3})?[-. )](?:\d{2,5})[-. (](?:\d{2,5})[-. )](?:\d{2,5})\b'
	return bool(re.search(phone_number_pattern, sentence))

	def remove_ssn(sentence):
	return re.sub(r'\b\d{3}-\d{2}-\d{4}\b', '', sentence)

	def remove_ip_addresses(sentence):
	return re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', '', sentence)

	def remove_credit_card_numbers(sentence):
	return re.sub(r'\b(?:\d{4}[ -]?){3}\d{4}\b', '', sentence)

	# def has_min_alphanumeric_percentage(sentence, min_percentage=50):
	# alphanumeric_count = sum(c.isalnum() for c in sentence)
	# total_count = len(sentence)
	# percentage = (alphanumeric_count / total_count) * 100
	# return percentage >= min_percentage

	def remove_repeated_chars(sentence):
	return re.sub(r'(.)\1{3,}', r'\1', sentence)

	def fix_encoding(text):
	return ftfy.fix_text(text)

	def normalize_whitespace(text):
	text = "".join([char if char not in whitespace else " " for char in text])
	return re.sub(r'\s+', ' ', text).strip()

	def has_min_chars(text, min_chars=3):
	return len(text) >= min_chars

	def filter_dataset(example):
	text = example["text"]
	text = fix_encoding(text)

	# Check for lorem ipsum and curly brackets in the initial text
	if contains_lorem_ipsum(text) or contains_curly_bracket(text) or not is_english(text):
	return {"text": ""}

	sentences = sent_tokenize(text)

	valid_sentences = []

	for sentence in sentences:
	if (is_terminal_punctuation(sentence) and
	is_valid_sentence(sentence) and
	not contains_javascript(sentence) and
	#not contains_warning(sentence) and
	not contains_phone_number(sentence) and
	not contains_url(sentence) and
	#has_min_alphanumeric_percentage(sentence) and
	has_min_chars(sentence)):
	sentence = remove_ssn(sentence)
	sentence = remove_repeated_chars(sentence)
	sentence = remove_ip_addresses(sentence)
	sentence = remove_credit_card_numbers(sentence)
	sentence = normalize_whitespace(sentence)
	valid_sentences.append(sentence)

	if len(valid_sentences) < 5:
	return {"text": ""}
	else:
	return {"text": "\n".join(valid_sentences)}

	if __name__ == "__main__":
	dataset = load_dataset("conceptofmind/test_c4_filters", split="train")

	print(dataset)

	filtered_dataset = dataset.map(
	filter_dataset,
	num_proc=multiprocessing.cpu_count()
	).filter(is_not_empty)

	print(filtered_dataset)

	filtered_dataset.push_to_hub('uber_clean')
	import unittest
	from datasets import load_dataset
	from uber_clean import (
	contains_phone_number,
	contains_javascript,
	contains_lorem_ipsum,
	contains_curly_bracket,
	is_terminal_punctuation,
	is_valid_sentence,
	has_min_chars,
	fix_encoding,
	normalize_whitespace,
	remove_repeated_chars,
	has_min_alphanumeric_percentage,
	is_english,
	remove_ssn,
	remove_credit_card_numbers,
	contains_url,
	remove_ip_addresses,
	filter_dataset
	)


	class TestFilters(unittest.TestCase):

	def test_contains_lorem_ipsum(self):
	test_cases = [
	("This is a sentence with lorem ipsum text.", True),
	("Lorem ipsum dolor sit amet, consectetur adipiscing elit.", True),
	("This is a normal sentence without it.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(contains_lorem_ipsum(sentence), expected_result)

	def test_contains_curly_bracket(self):
	test_cases = [
	("This is a sentence with a { curly bracket.", True),
	("The code snippet is: function() { return true; }", True),
	("This is a normal sentence without any curly brackets.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(contains_curly_bracket(sentence), expected_result)

	def test_is_terminal_punctuation(self):
	test_cases = [
	("This sentence ends with a period.", True),
	("What a great day!", True),
	("Is this a question?", True),
	("This sentence does not have terminal punctuation", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(is_terminal_punctuation(sentence), expected_result)

	def test_is_valid_sentence(self):
	test_cases = [
	("This sentence has at least three words.", True),
	("Only three words.", True),
	("One.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(is_valid_sentence(sentence), expected_result)


	def test_contains_phone_number(self):
	test_cases = [
	("Call me at (123) 456-7890.", True),
	("My number is +1 (555) 123-4567.", True),
	("You can reach me at 9876543210.", True),
	("The temperature is 100F today.", False),
	("This is a regular sentence without a phone number.", False),
	("The number you are trying to reach is no longer in service.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(contains_phone_number(sentence), expected_result)


	def test_contains_javascript(self):
	test_cases = [
	("This page requires JavaScript to run properly.", True),
	("Please enable Javascript in your browser.", True),
	("This is a sentence with the word javascript.", True),
	("This page uses CSS for styling.", False),
	("This is a regular sentence without the word.", False),
	("Java and Python are popular programming languages.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(contains_javascript(sentence), expected_result)


	def test_has_min_chars(self):
	test_cases = [
	("This sentence has more than 20 characters.", True),
	("This sentence has less.", True)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(has_min_chars(sentence, 20), expected_result)

	def test_fix_encoding(self):
	test_cases = [
	("This is a normal sentence without any encoding issues.", "This is a normal sentence without any encoding issues."),
	("This sentence has â€˜smartâ€™ quotes.", "This sentence has 'smart' quotes."),
	("MÃ¶bius strip is a surface with only one side.", "Möbius strip is a surface with only one side.")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(fix_encoding(input_text), expected_result)

	def test_normalize_whitespace(self):
	test_cases = [
	("This is a normal sentence without any unusual whitespace.", "This is a normal sentence without any unusual whitespace."),
	("This sentence has\u2009different\u200aspaces.", "This sentence has different spaces."),
	("This\u3000sentence\u2002has\u2003wide\u2004spaces.", "This sentence has wide spaces.")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(normalize_whitespace(input_text), expected_result)

	def test_remove_repeated_chars(self):
	test_cases = [
	("This is a normal sentence without any repeated characters.", "This is a normal sentence without any repeated characters."),
	("Thiiiiis sentenceeee has soooome repeaaaated chaaaaracters.", "This sentence has some repeated characters."),
	("AAAAAhhhhhh, I can't believe thisssss!", "Ah, I can't believe this!")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(remove_repeated_chars(input_text), expected_result)

	def test_has_min_alpha_numeric(self):
	test_cases = [
	("This is a normal sentence with enough alpha numeric characters.", True),
	("$%#@!&*^", False),
	("A sentence with 20% alpha numeric characters.", True),
	("A12_+%# $()?", False)
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(has_min_alphanumeric_percentage(input_text, min_percentage=75), expected_result)

	def test_is_english(self):
	test_cases = [
	("This is a normal English sentence.", True),
	("Ceci est une phrase en français.", False),
	("Dies ist ein Satz auf Deutsch.", False),
	("Esta es una oración en español.", False)
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(is_english(input_text), expected_result)

	def test_remove_ssn(self):
	test_cases = [
	("This is a normal sentence without any social security numbers.", "This is a normal sentence without any social security numbers."),
	("My social security number is 123-45-6789.", "My social security number is ."),
	("Another SSN is 987-65-4321, please handle it.", "Another SSN is , please handle it.")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(remove_ssn(input_text), expected_result)

	def test_contains_url(self):
	test_cases = [
	("Visit our website at https://www.example.com.", True),
	("You can find the article at http://example.org/article.", True),
	("Check out our blog: www.blog.example.net", True),
	("My email is john@example.com", False),
	("The price is $20,000.", False),
	("This is a regular sentence without a URL.", False)
	]

	for sentence, expected_result in test_cases:
	self.assertEqual(contains_url(sentence), expected_result)

	def test_remove_credit_card(self):
	test_cases = [
	("This is a normal sentence without any credit card numbers.", "This is a normal sentence without any credit card numbers."),
	("My credit card number is 1234-5678-9123-4567.", "My credit card number is ."),
	("Another credit card number is 9876-5432-1098-7654, please remove it.", "Another credit card number is , please remove it.")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(remove_credit_card_numbers(input_text), expected_result)

	def test_remove_ip(self):
	test_cases = [
	("This is a normal sentence without any IP addresses.", "This is a normal sentence without any IP addresses."),
	("The server IP address is 192.168.1.1.", "The server IP address is ."),
	("Another IP address is 10.0.0.1, please remove it.", "Another IP address is , please remove it.")
	]

	for input_text, expected_result in test_cases:
	self.assertEqual(remove_ip_addresses(input_text), expected_result)

	def test_filter_dataset(self):
	# Load a sample from the dataset
	dataset = load_dataset("conceptofmind/test_l", split="train")
	example = dataset[0] # Get the first example from the dataset

	# Apply the filter_dataset function
	filtered_example = filter_dataset(example)

	# Perform assertions to check if the filtered_example is as expected
	# Example: Check if the filtered_example is not empty
	self.assertNotEqual(filtered_example["text"], "")



	if __name__ == '__main__':
	unittest.main(argv=['first-arg-is-ignored'], exit=False)