|
-- Function strip_html(cSucia, to_csv): clob |
|
-- |
|
-- Motor de base de datos: Oracle 10g |
|
-- |
|
-- Parámetros: |
|
-- cSucia (clob): cadena a tratar. |
|
-- to_csv (number): |
|
-- 0 = elimina tags html (todo lo encerrado entre <>) y decodifica entidades html. |
|
-- 1 = idem 0 + prepara para exportación a csv saneando comillas dobles, simples, |
|
-- retornos de carro y puntos y coma. |
|
-- 2 = elimina tags html determinados, no todo lo encerrado entre <>. No decodifica entidades html ni sanea comillas dobles. |
|
|
|
CREATE OR REPLACE FUNCTION strip_html(cSucia IN clob,
                                      to_csv IN NUMBER DEFAULT 1)
RETURN clob IS
    -- Removes or sanitizes HTML markup in a CLOB.
    --
    -- Parameters:
    --   cSucia : text to clean. NULL or an empty CLOB is returned unchanged.
    --   to_csv : processing mode.
    --       0 = strip every tag and decode the named HTML entities listed below.
    --       1 = same as 0, then make the text safe for a semicolon-separated
    --           CSV cell: CR, LF and TAB are removed, semicolons become commas
    --           and double quotes become single quotes.
    --       2 = sanitize only: drop a fixed set of tags/attributes and keep
    --           the remaining markup. Entities are not decoded.
    --     Any other value (including NULL) behaves like 0.
    --
    -- Returns: the cleaned text as a CLOB.
    --
    -- NOTE(review): the patterns rely on lazy quantifiers, which are reported to
    -- need Oracle 10g Release 2 or later - confirm against the target database.

    -- The ampersand is built with CHR so that client tools which treat it as a
    -- substitution-variable marker cannot alter this source while loading it.
    c_amp     CONSTANT VARCHAR2(1) := CHR(38);
    -- Line break emitted for closing paragraph, row and break tags (CR + LF).
    c_newline CONSTANT VARCHAR2(2) := CHR(13) || CHR(10);

    v_result  CLOB;

    -- Replaces every occurrence of the named entity reference
    -- (ampersand + p_name + semicolon) in v_result with p_text.
    PROCEDURE decode_entity(p_name IN VARCHAR2, p_text IN VARCHAR2) IS
    BEGIN
        v_result := REPLACE(v_result, c_amp || p_name || ';', p_text);
    END decode_entity;

BEGIN
    -- Nothing to do for NULL or zero-length input: hand it back untouched.
    IF cSucia IS NULL OR LENGTH(cSucia) = 0 THEN
        RETURN cSucia;
    END IF;

    v_result := cSucia;

    -- Drop whole <xml>...</xml> islands, content included
    -- ("*?" is the lazy star: it takes as little as possible).
    v_result := regexp_replace(v_result, '<xml>.*?</xml>', '', 1, 0, 'ni');
    -- Drop <style ...>...</style> blocks, content included. The opening tag may
    -- carry attributes (e.g. type="text/css"). Every quantifier is kept lazy;
    -- NOTE(review): Oracle is reported to behave unexpectedly when greedy and
    -- lazy quantifiers are mixed in one pattern - verify before changing.
    v_result := regexp_replace(v_result, '<style[^>]*?>.*?</style>', '', 1, 0, 'ni');

    IF to_csv = 2 THEN
        -- Sanitize (do not strip) the HTML.

        -- <?xml:whatever> processing instructions.
        v_result := regexp_replace(v_result, '<\?xml:.*?>', '', 1, 0, 'ni');
        -- <img ...> tags.
        v_result := regexp_replace(v_result, '<img.*?>', '', 1, 0, 'ni');
        -- HTML comments.
        v_result := regexp_replace(v_result, '<!--.*?-->', '', 1, 0, 'ni');
        -- <meta ...> tags.
        v_result := regexp_replace(v_result, '<meta.*?>', '', 1, 0, 'ni');
        -- <link ...> tags.
        v_result := regexp_replace(v_result, '<link.*?>', '', 1, 0, 'ni');
        -- Opening and closing <div> tags (their content is kept).
        v_result := regexp_replace(v_result, '</?div.*?>', '', 1, 0, 'ni');
        -- Opening and closing <span> tags (their content is kept).
        v_result := regexp_replace(v_result, '</?span.*?>', '', 1, 0, 'ni');
        -- class attributes inside tags. The hyphen sits last in the bracket
        -- expression so it is unambiguously a literal, not a range operator.
        v_result := regexp_replace(v_result, '(<.*?)class="?[a-zA-Z0-9_-]*"?(.*?>)', '\1\2', 1, 0, 'ni');
        -- style attributes inside <i>, <b> and <p> tags.
        v_result := regexp_replace(v_result, '(<[ibp] .*?)style=".*?"(.*?>)', '\1\2', 1, 0, 'ni');
        -- Namespace prefixes: <o:p> becomes <p>, </o:p> becomes </p>.
        v_result := regexp_replace(v_result, '(<)[a-zA-Z0-9_-]*:(.*?>)', '\1\2', 1, 0, 'ni');
        v_result := regexp_replace(v_result, '(</)[a-zA-Z0-9_-]*:(.*?>)', '\1\2', 1, 0, 'ni');

        -- Remove empty open/close pairs. Two passes are made so that one level
        -- of nesting such as <strong><u></u></strong> is cleaned as well.
        -- Special case: an empty paragraph becomes a line break, not nothing.
        FOR pass IN 1 .. 2 LOOP
            v_result := regexp_replace(v_result, '<p></p>', '<br/>', 1, 0, 'ni');
            v_result := regexp_replace(v_result, '<([a-zA-Z0-9_-]*)></\1>', '', 1, 0, 'ni');
        END LOOP;
    ELSE
        -- Strip the HTML.

        -- Tags that visually end a line become a real line break. Matching is
        -- case-insensitive like every other pattern here, and the break
        -- pattern covers <br>, <br/> and </br>.
        v_result := regexp_replace(v_result, '</p[^>]*>', c_newline, 1, 0, 'i');
        v_result := regexp_replace(v_result, '</?br[^>]*>', c_newline, 1, 0, 'i');
        v_result := regexp_replace(v_result, '</tr[^>]*>', c_newline, 1, 0, 'i');

        -- Remove every remaining tag.
        v_result := regexp_replace(v_result, '<[^>]*>', '', 1, 0, 'ni');

        -- Decode named HTML entities: (entity name, replacement text).
        -- '*' is the placeholder for characters with no chosen equivalent.
        -- NOTE(review): several replacements are lossy or surprising (for
        -- example sol gives 'Sol', colon gives 'Colon', excl gives an inverted
        -- exclamation mark, commat gives a comma). They are kept exactly as
        -- originally authored - confirm the intent before correcting them.
        decode_entity('excl',   '¡');
        decode_entity('num',    'º');
        decode_entity('dollar', '$');
        decode_entity('percnt', '%');
        decode_entity('quot',   '"');
        decode_entity('lpar',   '(');
        decode_entity('rpar',   ')');
        decode_entity('ast',    '*');
        decode_entity('plus',   '+');
        decode_entity('comma',  ',');
        decode_entity('hyphen', '-');
        decode_entity('period', '.');
        decode_entity('sol',    'Sol');
        decode_entity('colon',  'Colon');
        decode_entity('semi',   '*');
        decode_entity('lt',     '<');
        decode_entity('equals', '=');
        decode_entity('gt',     '>');
        decode_entity('quest',  '?');
        decode_entity('commat', ',');
        decode_entity('lsqb',   '*');
        decode_entity('bsol',   '*');
        decode_entity('rsqb',   '*');
        decode_entity('circ',   '*');
        decode_entity('lowbar', '_');
        decode_entity('grave',  '''');
        decode_entity('lcub',   '*');
        decode_entity('verbar', '*');
        decode_entity('rcub',   '*');
        decode_entity('tilde',  '''');
        decode_entity('nbsp',   ' ');
        decode_entity('iexcl',  '¡');
        decode_entity('cent',   'cent');
        decode_entity('pound',  'L');
        decode_entity('curren', '*');
        decode_entity('yen',    'Y');
        decode_entity('brvbar', '*');
        decode_entity('sect',   '*');
        decode_entity('uml',    '.');
        decode_entity('copy',   '(c)');
        decode_entity('ordf',   '*');
        decode_entity('laquo',  '*');
        decode_entity('not',    '!');
        decode_entity('shy',    '*');
        decode_entity('reg',    '(r)');
        decode_entity('macr',   '*');
        decode_entity('deg',    '*');
        decode_entity('plusmn', '*');
        decode_entity('sup2',   '*');
        decode_entity('sup3',   '*');
        decode_entity('acute',  'á');
        decode_entity('micro',  'u');
        decode_entity('para',   '*');
        decode_entity('middot', '·');
        decode_entity('cedil',  'ç');
        decode_entity('sup1',   '*');
        decode_entity('ordm',   '*');
        decode_entity('raquo',  '*');
        decode_entity('frac14', '*');
        decode_entity('frac12', '*');
        decode_entity('frac34', '*');
        decode_entity('iquest', '¿');
        decode_entity('Agrave', 'È');
        decode_entity('Aacute', 'Á');
        decode_entity('Acirc',  'Ä');
        decode_entity('Atilde', 'Á');
        decode_entity('Auml',   '*');
        decode_entity('Aring',  '*');
        decode_entity('AElig',  'AE');
        decode_entity('Ccedil', '*');
        decode_entity('Egrave', 'È');
        decode_entity('Eacute', 'É');
        decode_entity('Ecirc',  '*');
        decode_entity('Euml',   '*');
        decode_entity('Igrave', 'Ì');
        decode_entity('Iacute', 'Í');
        decode_entity('Icirc',  'Î');
        decode_entity('Iuml',   '*');
        decode_entity('ETH',    '*');
        decode_entity('Ntilde', 'Ñ');
        decode_entity('Ograve', 'Ò');
        decode_entity('Oacute', 'Ó');
        decode_entity('Ocirc',  'Ô');
        decode_entity('Otilde', 'O');
        decode_entity('Ouml',   '*');
        decode_entity('times',  '*');
        decode_entity('Oslash', 'O');
        decode_entity('Ugrave', 'Ù');
        decode_entity('Uacute', 'Ú');
        decode_entity('Ucirc',  'Û');
        decode_entity('Uuml',   '*');
        decode_entity('Yacute', '*');
        decode_entity('THORN',  '*');
        decode_entity('szlig',  '*');
        decode_entity('agrave', 'à');
        decode_entity('aacute', 'á');
        decode_entity('acirc',  'â');
        decode_entity('atilde', 'a');
        decode_entity('auml',   '*');
        decode_entity('egrave', 'è');
        decode_entity('eacute', 'é');
        decode_entity('ecirc',  'ê');
        decode_entity('etilde', 'e');
        decode_entity('euml',   '*');
        decode_entity('igrave', 'ì');
        decode_entity('iacute', 'í');
        decode_entity('icirc',  'î');
        decode_entity('itilde', 'i');
        decode_entity('iuml',   '*');
        decode_entity('ntilde', 'ñ');
        decode_entity('ograve', 'ò');
        decode_entity('oacute', 'ó');
        decode_entity('ocirc',  'ô');
        decode_entity('otilde', 'o');
        decode_entity('ouml',   '*');
        decode_entity('ugrave', 'ù');
        decode_entity('uacute', 'ú');
        decode_entity('ucirc',  'û');
        decode_entity('utilde', 'u');
        decode_entity('uuml',   '*');
        -- The ampersand entity goes last: decoding it earlier would turn
        -- escaped text into a fresh entity reference and decode it twice.
        decode_entity('amp',    c_amp);

        -- Extra clean-up so the value fits in one semicolon-separated CSV cell.
        IF to_csv = 1 THEN
            v_result := REPLACE(v_result, CHR(10), '');
            v_result := REPLACE(v_result, CHR(13), '');
            v_result := REPLACE(v_result, CHR(9), '');
            v_result := REPLACE(v_result, ';', ',');
            v_result := REPLACE(v_result, '"', '''');
        END IF;
    END IF;

    RETURN v_result;
END strip_html;