Skip to content

Instantly share code, notes, and snippets.

@wogsland
Forked from sandys/clean_csv.rb
Created January 20, 2013 17:10
Show Gist options
  • Save wogsland/4579914 to your computer and use it in GitHub Desktop.
Save wogsland/4579914 to your computer and use it in GitHub Desktop.
require 'fastercsv'
require 'csv'
require 'net/http'
require 'sanitize'
# Location of the input CSV that the script reads.
# The commented-out line below is an alternative location: a 'c2i.csv'
# file sitting next to this script.
#CSV_FILE_PATH = File.join(File.dirname(__FILE__), 'c2i.csv')
# Frozen so the path constant cannot be mutated in place by later code.
CSV_FILE_PATH = File.join('/tmp', 'Computers_Accessories_Master.csv').freeze
=begin
FasterCSV.foreach(CSV_FILE_PATH, {:headers => true, :col_sep => "@@"}) do |line|
puts line
end
=end
# Parsed address of the wordoff.org `api/clean` endpoint. Only the disabled
# (=begin/=end) block right below refers to this variable.
uri = URI('http://wordoff.org/api/clean')
=begin
CSV.foreach(CSV_FILE_PATH, {:headers => true, :col_sep => "@"}) do |line|
puts line["Product Description*"]
req = Net::HTTP::Post.new(uri.path)
req.set_form_data({'html' => line["Product Description*"]})
http = Net::HTTP.new(uri.host, uri.port)
resp = http.request(req)
puts resp.body
end
=end
# Sanitize the HTML in the "Product Description*" column of the input CSV and
# write every row to /tmp/temp2.csv.
#
# Input:  CSV_FILE_PATH, comma-separated, first row treated as headers.
# Output: /tmp/temp2.csv, pipe-separated, quote character is a single quote,
#         quoting forced on.

# Name of the column whose HTML is sanitized.
description_column = "Product Description*"

# Sanitize configuration: only the listed elements are allowed, and the listed
# presentation attributes are added to every <table>.
# (Sanitize::Config::RELAXED was the alternative the original author tried.)
sanitize_options = {
  :elements => ['a', 'b', 'blockquote', 'br', 'em', 'i', 'img', 'li', 'ol', 'p',
                'span', 'strong', 'ul', 'table', 'td', 'tr'],
  :add_attributes => {
    'table' => {
      "width" => "100%",
      "border" => "1",
      "bordercolor" => "#dfdede",
      "cellpadding" => "0",
      "cellspacing" => "0"
    }
  }
}

# Options are passed without surrounding braces so they are treated as keyword
# arguments; the keyword-based CSV API on Ruby 3 raises ArgumentError when it
# is handed a positional options Hash instead.
csv_in = CSV.read(CSV_FILE_PATH, :headers => true, :col_sep => ",")

CSV.open('/tmp/temp2.csv', 'w', :col_sep => "|", :quote_char => '\'', :force_quotes => true) do |csv_out|
  # Take the headers from the table itself rather than from its first row, so
  # an input without data rows does not fail on a nil first row.
  csv_out << csv_in.headers

  csv_in.each do |line|
    html = line[description_column]

    # An empty cell (or a missing column) yields nil; such rows are written
    # through unchanged instead of failing on a String method call on nil.
    unless html.nil?
      clean = Sanitize.clean(html, sanitize_options)
      # Remove newline characters from the sanitized description.
      line[description_column] = clean.delete("\n")
    end

    csv_out << line.fields
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment