Created
September 4, 2010 19:07
-
-
Save andresgutgon/565410 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Stop words (very common words that carry little meaning) for Spanish,
# English and Catalan. See http://en.wikipedia.org/wiki/Stop_words

# Spanish
es_stopwords = %w[
  de a que en para
  por le la lo las los el
  una un cada
]

# English
en_stopwords = %w[
  I a about an are
  as at by com for
  from how of
  on or that the this to was
  what when where who will with www
]

# Catalan. The two-word entries ('per que', 'des de') are intentional phrases.
# 'fer'/'faig' and 'bastant'/'fas' are separate words; they were previously
# fused into single entries by a missing separator.
ca_stopwords = [
  'de', 'es', 'i', 'a', 'o', 'un', 'una', 'unes', 'uns',
  'tot', 'també', 'altre', 'algun', 'alguna', 'alguns',
  'algunes', 'ser', 'és', 'soc', 'ets', 'som', 'estic',
  'està', 'estem', 'esteu', 'estan', 'com', 'en', 'per', 'perquè',
  'per que', 'estat', 'estava', 'ans', 'abans', 'éssent', 'ambdós',
  'però', 'poder', 'potser', 'puc', 'podem', 'podeu', 'poden',
  'vaig', 'va', 'van', 'fer', 'faig', 'fa', 'fem', 'feu', 'fan',
  'cada', 'fi', 'inclòs', 'primer', 'des de', 'anar',
  'haver', 'tenir', 'tinc', 'te', 'tenim', 'teniu', 'tene',
  'el', 'la', 'les', 'els', 'seu', 'aquí', 'meu', 'teu',
  'ells', 'elles', 'ens', 'si', 'dins', 'sols', 'solament',
  'saber', 'saps', 'sap', 'sabem', 'sabeu', 'saben',
  'últim', 'llarg', 'bastant', 'fas', 'molts', 'seus', 'llavors',
  'sota', 'dalt', 'ús', 'molt', 'era', 'eres',
  'erem', 'eren', 'mode', 'bé', 'quant', 'quan', 'on', 'mentre',
  'qui', 'amb', 'entre', 'sense', 'jo', 'aquell'
]

# Combined list for all three languages. Words shared between languages
# (e.g. 'a', 'de', 'la') appear only once.
STOPWORDS = (es_stopwords + en_stopwords + ca_stopwords).uniq
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Abrir la clase string para meter un metodo que necesito para cortar y sanear strings | |
# Esto lo he metido en una app rails en config/initializers/open_string_class.rb | |
# I love Ruby! | |
# Fuentes para hacer truncate_and_dasherize | |
# [1] Este mensaje en la lista de Rails | |
# [2] Este post de Fernando Guillen. Gracias
# [3] Este plugin. Le he copiado cositas :) Thank you! | |
# [4] La pagina de UNICODE | |
# [1] http://lists.simplelogica.net/pipermail/ror-es/2010-September/024082.html | |
# [2] http://www.fernandoguillen.info/es/blog/2008/08/30/ruby-sanitizando-tus-titulos-en-2-lineas/ | |
# [3] http://github.com/ludo/to_slug/blob/master/lib/to_slug.rb | |
# [4] http://www.unicode.org/reports/tr15/#Norm_Forms | |
# Reopens String to add +truncate_slugize+, which turns arbitrary text into a
# short ASCII slug. Only core Ruby is used (String#unicode_normalize).
class String
  # Stop-word lists, see http://en.wikipedia.org/wiki/Stop_words
  es_stopwords = %w[
    de a que en para
    por le la lo las los el
    una un cada
  ]
  en_stopwords = %w[
    I a about an are
    as at by com for
    from how of
    on or that the this to was
    what when where who will with www
  ]
  ca_stopwords = %w[
    de es i a o un tot altre algun alguna alguns
    algunes ser és soc ets som estic
    està estem esteu estan com en per perquè
    però el la les els seu aquí meu teu
    ells elles ens si dins sols seus llavors sota dalt ús molt
    bé quant quan on mentre qui amb entre sense jo aquell
  ]

  # Spanish + English + Catalan stop words, without duplicates.
  # Deliberately left unfrozen: the array was mutable before, and
  # truncate_slugize reads it on every call, so additions take effect.
  STOPWORDS = (es_stopwords + en_stopwords + ca_stopwords).uniq

  # Builds a lowercase ASCII slug from the string.
  #
  # Steps: decompose accented characters and drop everything non-ASCII
  # ("Niño" -> "nino"), remove single quotes, split into words, optionally
  # drop stop words, cut the result to +length+ characters (the cut may fall
  # in the middle of a word) and join what is left with +sep+.
  #
  # @param sep [String] word separator; must be "-", "_" or "+"
  # @param length [Integer] maximum number of characters kept before the
  #   separators are inserted; values below 0 are treated as 0
  # @param drop_stopwords [Boolean] remove the words listed in STOPWORDS
  # @return [String] the slug; "" when the string is empty or has no
  #   ASCII-representable word characters
  # @raise [RuntimeError] when +sep+ is not one of the allowed separators
  def truncate_slugize(sep = "-", length = 15, drop_stopwords = true)
    raise "Word separator must be one of - _ +" unless %w[- _ +].include?(sep)
    return "" if empty?

    # NFKD splits "é" into "e" + combining accent; stripping non-ASCII then
    # leaves the bare letter. The same folding is applied to the stop words
    # so that accented entries ("està", "aquí") and "I" can actually match.
    ascii_fold = lambda do |text|
      text.unicode_normalize(:nfkd).gsub(/[^\x00-\x7F]/, '').delete("'").downcase
    end

    words = ascii_fold.call(self).split(/\W+/).reject(&:empty?)

    if drop_stopwords
      kept = words - STOPWORDS.map(&ascii_fold)
      # A title made only of stop words ("The Who") keeps its words rather
      # than collapsing into an empty slug.
      words = kept unless kept.empty?
    end

    re_sep = Regexp.escape(sep)
    # String#[start, len] (unlike 0..length-1) yields "" for a length of 0.
    # Non-bang gsub is used so the method always returns a String; gsub!
    # returns nil when nothing was replaced.
    words.join(' ')[0, [length, 0].max].strip
         .gsub(/[^a-z0-9]+/, sep)
         .gsub(/\A#{re_sep}+|#{re_sep}+\z/, '')
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment