-
-
Save slowkow/7a7f61f495e3dbb7e3d767f97bd7304b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python | |
""" | |
Remove emoji from a text file and print it to stdout. | |
Usage | |
----- | |
python remove-emoji.py input.txt > output.txt | |
""" | |
import re | |
import sys | |
# https://stackoverflow.com/a/49146722/330558 | |
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Every run of characters matched by the character class below is
    replaced with the empty string; all other characters are kept.

    A single U+24C2-U+1F251 range is deliberately NOT used: it spans the
    CJK, kana and Hangul blocks and would delete ordinary Chinese,
    Japanese and Korean text. The symbol-only ranges below cover the
    pictographic parts of that span instead.
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U00002BFF"  # enclosed letters, shapes, misc symbols, arrows
        u"\U00003030\U0000303D"   # wavy dash, part alternation mark
        u"\U00003297\U00003299"   # circled ideographs used as emoji
        u"\U0000FE0F"             # variation selector-16 that trails many emoji
        u"\U0001F000-\U0001F251"  # mahjong, cards, enclosed supplements
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
if __name__ == '__main__':
    # Read the whole file named by the first command-line argument. The
    # context manager closes the handle even if reading raises.
    with open(sys.argv[1]) as input_file:
        text = input_file.read()
    text = remove_emoji(text)
    print(text)
I have some lines of text like this:
🟥凯发🟥凯发🟥
🟥凯时🟥凯时🟥
🟥和记🟥和记🟥
🟥美国🟥中国🟥
and I want to split them into words like this:
🟥
凯发
🟥
凯发
🟥
🟥
凯时
🟥
凯时
🟥
🟥
和记
🟥
和记
🟥
🟥
美国
🟥
中国
🟥
and then a duplicate-removal step should run at the end of the process, so the output will be:
🟥
凯发
凯发
凯时
凯时
和记
和记
美国
中国
Can someone help me?
```
def remove_emoji(inputString):
    """Return ``inputString`` with emoji and related characters removed.

    Every run of characters matched by the character class below is
    replaced with the empty string. The class covers all characters at or
    above U+10000, the BMP symbol ranges listed, and the zero-width /
    bidirectional control characters listed at the end.

    A single U+24C2-U+1F251 range is deliberately NOT used: it spans the
    CJK, kana and Hangul blocks and would delete ordinary Chinese,
    Japanese and Korean text.
    """
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U000025FF"  # enclosed letters, box drawing, geometric shapes
        u"\u303d\u3297\u3299"     # BMP emoji located inside the CJK symbol blocks
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'  # every supplementary-plane character
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', inputString)
do you hate fun?
These special characters corrupted my HDD. Where is the fun in that?
Figured I'd leave my solution here, as this thread helped me. Here's a Ruby script that cleans all CSVs in the passed-in directory:
ruby script_name.rb path/to/csvs/
#!/usr/bin/env ruby
require 'csv'
require 'fileutils'
# Returns a copy of +string+ with every run of emoji-like characters removed.
#
# The character class matches all characters at or above U+10000, the
# symbol range U+24C2-U+2BEF, and the individual code points listed.
# A single U+24C2-U+1F251 range is deliberately NOT used: it spans the CJK,
# kana and Hangul blocks and would delete ordinary Chinese, Japanese and
# Korean text.
#
# @param string [String] text to clean
# @return [String] the text with matched characters removed
def remove_emoji(string)
  emoji_pattern = /[\u{10000}-\u{10FFFF}\u{24C2}-\u{2BEF}\u{200D}\u{231A}\u{23CF}\u{23E9}\u{3030}\u{303D}\u{3297}\u{3299}\u{FE0F}]+/
  string.gsub(emoji_pattern, '')
end
# Reads +input_file+ as a CSV with a header row, strips emoji from every
# header and every field via remove_emoji, and writes the result to a file
# of the same name inside +cleaned_dir+ (created if it does not exist).
#
# @param input_file [String] path of the CSV file to read
# @param cleaned_dir [String] directory that receives the cleaned copy
def process_csv(input_file, cleaned_dir)
  csv_data = CSV.read(input_file, headers: true, encoding: 'utf-8')
  # Blank header cells are read as nil; pass them through untouched.
  cleaned_headers = csv_data.headers.map { |header| header && remove_emoji(header) }

  FileUtils.mkdir_p(cleaned_dir)
  output_file = File.join(cleaned_dir, File.basename(input_file))

  CSV.open(output_file, "w", write_headers: true, headers: cleaned_headers) do |csv|
    csv_data.each do |row|
      # Clean the cell values too, not just the headers; empty cells are nil.
      csv << row.fields.map { |field| field && remove_emoji(field) }
    end
  end
  puts "Cleaned emojis and saved to #{output_file}"
end
# Runs process_csv on each file matching "*.csv" directly inside +directory+,
# sending the output to a "cleaned_csvs" subdirectory of +directory+.
#
# @param directory [String] directory containing the CSV files
def process_directory(directory)
  output_dir = File.join(directory, 'cleaned_csvs')
  csv_paths = Dir.glob(File.join(directory, '*.csv'))
  csv_paths.each { |csv_path| process_csv(csv_path, output_dir) }
end
# Entry point: the first command-line argument is the directory of CSVs.
directory_path = ARGV[0]
# Fail with a clear message instead of a TypeError (no argument) or a
# silent no-op (path is not a directory).
abort("Usage: ruby #{$PROGRAM_NAME} path/to/csvs/") if directory_path.nil?
abort("Not a directory: #{directory_path}") unless File.directory?(directory_path)
process_directory(directory_path)
Не плохо!
no work