# andresgutgon/STOPWORDS

## STOPWORDS
# http://en.wikipedia.org/wiki/Stop_words
  # Stop-word lists grouped by language. Each word appears once per list.
  # Spanish
  es_STOPWORDS = %w[
    de a que en para
    por le la lo las los el
    una un cada
  ]
  # English
  en_STOPWORDS = %w[
    I a about an are
    as at by com for
    from how of
    on or that the this to was
    what when where who will with www
  ]
  # Catalan. 'fer'/'faig' and 'bastant'/'fas' are separate entries: fused into a
  # single string (as they used to be) they could never match a single word.
  # Multi-word entries cannot live in a %w literal, so they are appended.
  ca_STOPWORDS = %w[
    de es i a o un una unes uns
    tot també altre algun alguna alguns
    algunes ser és soc ets som estic
    està estem esteu estan com en per perquè
    estat estava ans abans éssent ambdós
    però poder potser puc podem podeu poden
    vaig va van fer faig fa fem feu fan cada fi inclòs primer anar
    haver tenir tinc te tenim teniu tene el la les els seu aquí meu teu
    ells elles ens si dins sols solament saber saps sap sabem sabeu saben
    últim llarg bastant fas molts seus llavors sota dalt ús molt era eres
    erem eren mode bé quant quan on mentre qui amb entre sense jo aquell
  ] + ['per que', 'des de']
  # Combined list; words shared between languages are kept only once.
  STOPWORDS = (es_STOPWORDS + en_STOPWORDS + ca_STOPWORDS).uniq

## truncate_slugize
# Reopen the String class to add a method that truncates and sanitizes strings.
# I placed this in a Rails app at config/initializers/open_string_class.rb
# I love Ruby!
# Sources used to write truncate_slugize
 # [1] This message on the Rails mailing list
 # [2] This post by Fernando Guillen. Thanks
 # [3] This plugin. I copied a few bits from it :) Thank you!
 # [4] The Unicode page

 # [1] http://lists.simplelogica.net/pipermail/ror-es/2010-September/024082.html
 # [2] http://www.fernandoguillen.info/es/blog/2008/08/30/ruby-sanitizando-tus-titulos-en-2-lineas/
 # [3] http://github.com/ludo/to_slug/blob/master/lib/to_slug.rb
 # [4] http://www.unicode.org/reports/tr15/#Norm_Forms
class String
  # Stop words (http://en.wikipedia.org/wiki/Stop_words) that #truncate_slugize
  # drops from a slug, grouped by language.
  es_STOPWORDS = %w[
    de a que en para
    por le la lo las los el
    una un cada
  ]
  en_STOPWORDS = %w[
    I a about an are
    as at by com for
    from how of
    on or that the this to was
    what when where who will with www
  ]
  ca_STOPWORDS = %w[
    de es i a o un tot altre algun alguna alguns
    algunes ser és soc ets som estic
    està estem esteu estan com en per perquè
    però el la les els seu aquí meu teu
    ells elles ens si dins sols seus llavors sota dalt ús molt
    bé quant quan on mentre qui amb entre sense jo aquell
  ]

  # Combined list without repeats. Left unfrozen so code that appends extra
  # words to it keeps working; the list is re-read on every call.
  STOPWORDS = (es_STOPWORDS + en_STOPWORDS + ca_STOPWORDS).uniq

  # Builds a short, lower-case, ASCII-only slug from the string.
  #
  # The text is transliterated to ASCII, punctuation becomes word breaks,
  # stop words are optionally removed, the result is cut to +length+
  # characters and the remaining words are joined with +sep+.
  #
  # @param sep [String] word separator; must be "-", "_" or "+"
  # @param length [Integer] maximum number of characters kept (values below 0 count as 0)
  # @param drop_stopwords [Boolean] whether words listed in STOPWORDS are removed
  # @return [String] the slug; "" when the receiver is empty. Always a String.
  # @raise [RuntimeError] when +sep+ is not one of the allowed separators
  def truncate_slugize(sep = "-", length = 15, drop_stopwords = true)
    raise "Word separator must be one of - _ +" unless %w[- _ +].include?(sep)
    return "" if empty?

    # NFKD splits an accented letter into base letter + combining mark, so
    # dropping every non-ASCII character afterwards keeps just the base letter.
    to_ascii = ->(text) { text.unicode_normalize(:nfkd).gsub(/[^\x00-\x7F]/, '') }

    value = to_ascii.call(self)
    # Remove single quotes so "don't" becomes "dont" rather than "don t".
    value.delete!("'")
    # Replace any run of non-word characters (\W) with a single space.
    value.gsub!(/\W+/, ' ')
    value.downcase!
    # Remove any whitespace before and after the string.
    value.strip!

    if drop_stopwords
      # Stop words get the same transliteration as the input, otherwise
      # accented entries such as "perquè" could never match.
      words = STOPWORDS.map { |word| Regexp.escape(to_ascii.call(word).downcase) }
                       .reject(&:empty?).uniq
      unless words.empty?
        # The trailing whitespace is only looked at, not consumed, so it can
        # serve as the leading delimiter of the next stop word.
        pattern = /(?:\A|\s)(?:#{words.join('|')})(?=\s|\z)/
        remaining = value.gsub(pattern, '').strip
        # If every word is a stop word, keep them rather than return nothing.
        value = remaining unless remaining.empty?
      end
    end

    # Cut to the requested length; the cut may leave a trailing space behind.
    value = value[0, [length, 0].max].strip

    # Join words with the separator. The separator is escaped because "+" is a
    # regexp metacharacter. Non-bang gsub is used so a String is always returned.
    re_sep = Regexp.escape(sep)
    value.gsub(/[^a-z0-9]+/, sep).gsub(/\A#{re_sep}+|#{re_sep}+\z/, '')
  end
end
	# http://en.wikipedia.org/wiki/Stop_words
	# Stop-word lists grouped by language. Each word appears once per list.
	# Spanish
	es_STOPWORDS = %w[
	  de a que en para
	  por le la lo las los el
	  una un cada
	]
	# English
	en_STOPWORDS = %w[
	  I a about an are
	  as at by com for
	  from how of
	  on or that the this to was
	  what when where who will with www
	]
	# Catalan. 'fer'/'faig' and 'bastant'/'fas' are separate entries: fused into a
	# single string (as they used to be) they could never match a single word.
	# Multi-word entries cannot live in a %w literal, so they are appended.
	ca_STOPWORDS = %w[
	  de es i a o un una unes uns
	  tot també altre algun alguna alguns
	  algunes ser és soc ets som estic
	  està estem esteu estan com en per perquè
	  estat estava ans abans éssent ambdós
	  però poder potser puc podem podeu poden
	  vaig va van fer faig fa fem feu fan cada fi inclòs primer anar
	  haver tenir tinc te tenim teniu tene el la les els seu aquí meu teu
	  ells elles ens si dins sols solament saber saps sap sabem sabeu saben
	  últim llarg bastant fas molts seus llavors sota dalt ús molt era eres
	  erem eren mode bé quant quan on mentre qui amb entre sense jo aquell
	] + ['per que', 'des de']
	# Combined list; words shared between languages are kept only once.
	STOPWORDS = (es_STOPWORDS + en_STOPWORDS + ca_STOPWORDS).uniq
	# Reopen the String class to add a method that truncates and sanitizes strings.
	# I placed this in a Rails app at config/initializers/open_string_class.rb
	# I love Ruby!
	# Sources used to write truncate_slugize
	# [1] This message on the Rails mailing list
	# [2] This post by Fernando Guillen. Thanks
	# [3] This plugin. I copied a few bits from it :) Thank you!
	# [4] The Unicode page

	# [1] http://lists.simplelogica.net/pipermail/ror-es/2010-September/024082.html
	# [2] http://www.fernandoguillen.info/es/blog/2008/08/30/ruby-sanitizando-tus-titulos-en-2-lineas/
	# [3] http://github.com/ludo/to_slug/blob/master/lib/to_slug.rb
	# [4] http://www.unicode.org/reports/tr15/#Norm_Forms
	class String
	  # Stop words (http://en.wikipedia.org/wiki/Stop_words) that #truncate_slugize
	  # drops from a slug, grouped by language.
	  es_STOPWORDS = %w[
	    de a que en para
	    por le la lo las los el
	    una un cada
	  ]
	  en_STOPWORDS = %w[
	    I a about an are
	    as at by com for
	    from how of
	    on or that the this to was
	    what when where who will with www
	  ]
	  ca_STOPWORDS = %w[
	    de es i a o un tot altre algun alguna alguns
	    algunes ser és soc ets som estic
	    està estem esteu estan com en per perquè
	    però el la les els seu aquí meu teu
	    ells elles ens si dins sols seus llavors sota dalt ús molt
	    bé quant quan on mentre qui amb entre sense jo aquell
	  ]

	  # Combined list without repeats. Left unfrozen so code that appends extra
	  # words to it keeps working; the list is re-read on every call.
	  STOPWORDS = (es_STOPWORDS + en_STOPWORDS + ca_STOPWORDS).uniq

	  # Builds a short, lower-case, ASCII-only slug from the string.
	  #
	  # The text is transliterated to ASCII, punctuation becomes word breaks,
	  # stop words are optionally removed, the result is cut to +length+
	  # characters and the remaining words are joined with +sep+.
	  #
	  # @param sep [String] word separator; must be "-", "_" or "+"
	  # @param length [Integer] maximum number of characters kept (values below 0 count as 0)
	  # @param drop_stopwords [Boolean] whether words listed in STOPWORDS are removed
	  # @return [String] the slug; "" when the receiver is empty. Always a String.
	  # @raise [RuntimeError] when +sep+ is not one of the allowed separators
	  def truncate_slugize(sep = "-", length = 15, drop_stopwords = true)
	    raise "Word separator must be one of - _ +" unless %w[- _ +].include?(sep)
	    return "" if empty?

	    # NFKD splits an accented letter into base letter + combining mark, so
	    # dropping every non-ASCII character afterwards keeps just the base letter.
	    to_ascii = ->(text) { text.unicode_normalize(:nfkd).gsub(/[^\x00-\x7F]/, '') }

	    value = to_ascii.call(self)
	    # Remove single quotes so "don't" becomes "dont" rather than "don t".
	    value.delete!("'")
	    # Replace any run of non-word characters (\W) with a single space.
	    value.gsub!(/\W+/, ' ')
	    value.downcase!
	    # Remove any whitespace before and after the string.
	    value.strip!

	    if drop_stopwords
	      # Stop words get the same transliteration as the input, otherwise
	      # accented entries such as "perquè" could never match.
	      words = STOPWORDS.map { |word| Regexp.escape(to_ascii.call(word).downcase) }
	                       .reject(&:empty?).uniq
	      unless words.empty?
	        # The trailing whitespace is only looked at, not consumed, so it can
	        # serve as the leading delimiter of the next stop word.
	        pattern = /(?:\A|\s)(?:#{words.join('|')})(?=\s|\z)/
	        remaining = value.gsub(pattern, '').strip
	        # If every word is a stop word, keep them rather than return nothing.
	        value = remaining unless remaining.empty?
	      end
	    end

	    # Cut to the requested length; the cut may leave a trailing space behind.
	    value = value[0, [length, 0].max].strip

	    # Join words with the separator. The separator is escaped because "+" is a
	    # regexp metacharacter. Non-bang gsub is used so a String is always returned.
	    re_sep = Regexp.escape(sep)
	    value.gsub(/[^a-z0-9]+/, sep).gsub(/\A#{re_sep}+|#{re_sep}+\z/, '')
	  end
	end