Skip to content

Instantly share code, notes, and snippets.

@vshabanov
Created January 10, 2012 08:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vshabanov/1587842 to your computer and use it in GitHub Desktop.
Save vshabanov/1587842 to your computer and use it in GitHub Desktop.
Более шустрый парсер для tagsoup (где-то раз в 200 быстрее)
{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-}
-- | Тупейший (и, надеюсь, быстрый) парсер xml -> теги.
--
-- Выдает где-то от 40МБ/сек (а где-то и 80 и 200)
--
-- Все теги и атрибуты возвращаются в lowercase (строка меняется INPLACE,
-- для уменьшения числа аллокаций и фрагментации)
--
-- После профилирования выяснилось,
-- что TagSoup отжирает 75% памяти и >50% времени. Более того,
-- оказалось у него внутри весь парсинг идет в string и только потом
-- трансформация в Text. Да еще и позиции обрабатываются. В общем куча
-- лишней работы.
--
-- Посмотрел другие библиотеки:
-- * hexpat -- все проверяет, google.ru не разгребает
-- * libxml -- то же самое
--
-- Помимо всего прочего, TagSoup работает с unicode и пришлось
-- использовать Text, хотя это нафиг не нужно. Мы можем тупо работать
-- со strict bytestring (все равно целиком скачиваем) и всю работу
-- проводить в utf-8
--
--
module Lib.XMLParser
( parse, render, parseT, renderT )
where
import Data.Bits
import Control.Monad
import Text.HTML.TagSoup (Tag(..),parseTags,renderTags)
import Text.HTML.TagSoup.Entity
--import Text.HTML.TagSoup (Tag(..))
import qualified Data.ByteString as B8
import qualified Data.ByteString.Char8 as B
import qualified Data.ByteString.Internal as B
import Foreign
import Data.String
-- import Downloader
import qualified Data.ByteString.Lazy.Char8 as BL
import Data.Char
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
instance IsString Word8 where
fromString [c] = B.c2w c
type P = Ptr Word8
r :: P -> Word8
r ptr = B.inlinePerformIO (peek ptr)
{-# INLINE r #-}
instance Enum (Ptr Word8) where
succ p = p `plusPtr` 1
toEnum = error "toEnum: Ptr Word8"
fromEnum = error "toEnum: Ptr Word8"
pp :: Enum a => a -> a
pp !x = succ x
mm !y = pred y
{-# INLINE mm #-}
{-# INLINE pp #-}
-- делаем INPLACE toLower, хотя вроде это не особо влияет, судя по
-- Parser.perfTest (но он и фрагментацию не показывает).
-- А вот проверка перед escape/unescape уменьшает отжираемую память
-- toLowerBS = B.map toLower
toLowerBS bs@(B.PS fp o l) = B.inlinePerformIO $ withForeignPtr fp $ \ p -> do
let go !o 0 = return bs
go !o l = do
w <- peekByteOff p o
pokeByteOff p o (B.c2w $ toLower $ B.w2c w)
go (o+1) (l-1)
go o l
parse :: B.ByteString -> [Tag B.ByteString]
parse s = B.inlinePerformIO $ withForeignPtr fp $ \ p ->
return $
map fixTag $ filter (/= TagText "") $
dat (p `plusPtr` offset) len 0
-- [TagText s]
where (fp, offset, len) = B.toForeignPtr s
fixTag (TagOpen t a) =
TagOpen t (map (\(a,v) -> (toLowerBS a, unescapeHtml v)) $
reverse a)
-- fixTag (TagText t) = TagText (unescapeHtml t)
fixTag t = t
-- По-правильному надо делать так
-- http://www.w3.org/TR/html5/tokenization.html
-- и tagsoup так пытается. Я упростил часть state-ов
-- (особенно comment-ы) и убрал разгребание символов
mkS !left !n = B.fromForeignPtr fp (offset + len - left - n) n
mkText unescape !left !n
| unescape = TagText $ unescapeHtml $ mkS left n
| otherwise = TagText $ mkS left n
-- script/style/CDATA оставляем как есть
script :: P -> Int -> Int -> [Tag B.ByteString]
script p 0 0 = []
script p 0 n = [mkText False 0 n]
script !p !l !n
| nextIC p l (toLowerUpperPairs "</script>") =
mkText False l n : TagClose "script"
: dat (p `plusPtr` 9) (l - 9) 0
| otherwise = script (pp p) (mm l) (pp n)
style :: P -> Int -> Int -> [Tag B.ByteString]
style p 0 0 = []
style p 0 n = [mkText False 0 n]
style !p !l !n
| nextIC p l (toLowerUpperPairs "</style>") =
mkText False l n : TagClose "style"
: dat (p `plusPtr` 8) (l - 8) 0
| otherwise = style (pp p) (mm l) (pp n)
dat :: P -> Int -> Int -> [Tag B.ByteString]
dat p 0 0 = []
dat p 0 n = [mkText True 0 n]
dat !p !left !n
| r p == "<" = mkText True left n : tagOpen (pp p) (mm left)
| otherwise = dat (pp p) (mm left) (pp n)
tdat !t !p !l = t : case t of
TagOpen "script" _ -> script p l 0
TagOpen "style" _ -> style p l 0
_ -> dat p l 0
tagOpen p 0 = dat p 0 1
tagOpen !p !left
| r p == "!" = markupDeclOpen (pp p) (mm left)
| alpha (r p) || r p == "?" = tagName True (pp p) (mm left) 1
| r p == "/" = closeTagOpen (pp p) (mm left)
| otherwise = dat p left 1
closeTagOpen p 0 = dat p 0 2
closeTagOpen !p !left
| alphaBQ (r p) = tagName False (pp p) (mm left) 1
| otherwise = dat p 0 2
tagName o p 0 n = [] -- ничего не выводим, если тег
-- закрылся раньше времени
tagName !o !p !left !n
| space (r p) = beforeAttName tag (pp p) (mm left)
| r p == ">" = tdat tag (pp p) (mm left)
| r p == "?" || r p == "/" =
selfClosingStartTag tag (pp p) (mm left)
| r p == "'" || r p == "\"" = attValue (r p) tag "" (pp p) (mm left) 0
| otherwise = tagName o (pp p) (mm left) (pp n)
where tag | o = TagOpen (toLowerBS $ mkS left n) []
| otherwise = TagClose (toLowerBS $ mkS left n)
markupDeclOpen p 0 = dat p 0 2
markupDeclOpen !p !l
| alpha (r p) = tagName True (pp p) (mm l) 1
| next p l "--" = commentStart (p `plusPtr` 2) (l - 2) 0
| next p l "[CDATA[" = cdataSection (p `plusPtr` 7) (l - 7) 0
| otherwise = dat p l 2
beforeAttName t p 0 = [t]
beforeAttName !t !p !l
| space (r p) = beforeAttName t (pp p) (mm l)
| r p == ">" = tdat t (pp p) (mm l)
| r p == "?" || r p == "/" = selfClosingStartTag t (pp p) (mm l)
| r p == "'" || r p == "\"" = attValue (r p) t "" (pp p) (mm l) 0
| otherwise = attName t (pp p) (mm l) 1
attName t p 0 n = [t]
attName !t !p !l !n
| space (r p) = afterAttName t (mkS l n) (pp p) (mm l)
| r p == ">" = tdat (addAttr t (mkS l n) "") (pp p) (mm l)
| r p == "?" || r p == "/" =
selfClosingStartTag (addAttr t (mkS l n) "") (pp p) (mm l)
| r p == "=" = beforeAttValue t (mkS l n) (pp p) (mm l)
| otherwise = attName t (pp p) (mm l) (pp n)
afterAttName t a p 0 = [t]
afterAttName !t !a !p !l
| space (r p) = afterAttName t a (pp p) (mm l)
| r p == "=" = beforeAttValue t a (pp p) (mm l)
| r p == ">" = tdat (addAttr t a "") (pp p) (mm l)
| r p == "?" || r p == "/" =
selfClosingStartTag (addAttr t a "") (pp p) (mm l)
| r p == "'" || r p == "\"" = attValue (r p) t a (pp p) (mm l) 0
| otherwise = attName (addAttr t a "") (pp p) (mm l) 1
beforeAttValue t a p 0 = [t]
beforeAttValue !t !a !p !l
| space (r p) = beforeAttValue t a (pp p) (mm l)
| r p == ">" = tdat (addAttr t a "") (pp p) (mm l)
-- | (r p == "?" || r p == "/") &&
-- (l == 0 || (r (pp p) == ">")) =
-- selfClosingStartTag (addAttr t a "") (pp p) (mm l)
| r p == "'" || r p == "\"" = attValue (r p) t a (pp p) (mm l) 0
| otherwise = attValueUnquoted t a (pp p) (mm l) 1
attValue end t a p 0 n = [t]
attValue !end !t !a !p !l !n
| r p == end = beforeAttName (addAttr t a (mkS l n)) (pp p) (mm l)
| otherwise = attValue end t a (pp p) (mm l) (pp n)
attValueUnquoted t a p 0 n = [t]
attValueUnquoted !t !a !p !l !n
| space (r p) = beforeAttName (addAttr t a (mkS l n)) (pp p) (mm l)
| r p == ">" -- || r p == "/"
= beforeAttName (addAttr t a (mkS l n)) p l
| otherwise = attValueUnquoted t a (pp p) (mm l) (pp n)
commentStart p 0 n = [] -- комменты никуда не выводим
commentStart !p !l !n
| next p l "-->" = dat (p `plusPtr` 3) (l - 3) 0
| otherwise = commentStart (pp p) (mm l) (pp n)
cdataSection p 0 n = dat p 0 n
cdataSection !p !l !n
| next p l "]]>" = mkText False l n : dat (p `plusPtr` 3) (l-3) 0
| otherwise = cdataSection (pp p) (mm l) (pp n)
selfClosingStartTag t p 0 = closeTag t []
selfClosingStartTag !t !p !l
| r p == ">" = closeTag t $ dat (pp p) (mm l) 0
| otherwise = beforeAttName t p l
addAttr (TagOpen !tn !ta) !a !v = TagOpen tn ((a, v):ta)
addAttr t _ _ = t
closeTag !t@(TagOpen !tn _) !r = t : TagClose tn : r
closeTag !t !r = t : r
space 0x20 = True
space !n = n >= 9 && n <= 13 -- \t\n\v\f\r
alpha !c = (c >= "a" && c <= "z") || (c >= "A" && c <= "Z")
alphaBQ !c = alpha c || c == "?" || c == "!"
next p l [] = True
next p 0 _ = False
next !p !l (x:xs) = (r p == B.c2w x) && next (pp p) (mm l) xs
toLowerUpperPairs s = [(B.c2w $ toLower x, B.c2w $ toUpper x) | x <- s]
nextIC p l [] = True
nextIC p 0 _ = False
nextIC !p !l ((a,b):xs) = (r p == a || r p == b) && nextIC (pp p) (mm l) xs
{-# INLINE parse #-}
-- ord maxBound = 1114111 = 0x10FFFF = &#1114111; &#x10FFFF;
-- максимум 8 символов между & и ;
-- буквенные обозначения также не больше 8 символов
-- т.е. искать ';' после '&' можно в пределах ближайших 9 символов
unescapeHtml :: B.ByteString -> B.ByteString
unescapeHtml s
| not (B8.elem "&" s) = s
| B.length s' == 0 = s -- дабы лишняя аллокация сразу прибилась
| otherwise = s'
where s' = B.inlinePerformIO $ withForeignPtr fp $ \ src ->
B.createAndTrim (len*2) $ \ dst -> do
(ch, dst') <- go (src `plusPtr` offset) dst len False
return $ if ch then dst' `minusPtr` dst else 0
(fp, offset, len) = B.toForeignPtr s
go :: P -> P -> Int -> Bool -> IO (Bool, P)
go !s !d !0 !c = return (c, d)
go !s !d !l !c
-- | r s .&. (0x8 + 0x4) == 0x8
| r s /= "&" =
poke d (r s) >> go (pp s) (pp d) (mm l) c
| otherwise = entity (pp s) d (mm l) 9 [] c
-- add !c !s !d !n !l =
-- poke d c >> go (s `plusPtr` n) (pp d) (l - n) True
entity !s !d !l !n !acc !c
| n == 0 || l == 0 =
poke d "&" >> go (s `plusPtr` (n-9)) (pp d) (l+9-n) c
| r s /= ";" =
entity (pp s) d (mm l) (mm n) (B.w2c (r s) : acc) c
| otherwise = case lookupEntity $ reverse acc of
Nothing ->
poke d "&" >> go (s `plusPtr` (n-9)) (pp d) (l+9-n) c
Just i -> do
let put !d [] = go (pp s) d (mm l) True
put !d (x:xs) =
poke d x >> put (pp d) xs
put d $ encodeChar i
encodeChar :: Char -> [Word8]
encodeChar = map fromIntegral . go . ord
where
go oc
| oc <= 0x7f = [oc]
| oc <= 0x7ff = [ 0xc0 + (oc `shiftR` 6)
, 0x80 + oc .&. 0x3f
]
| oc <= 0xffff = [ 0xe0 + (oc `shiftR` 12)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
| otherwise = [ 0xf0 + (oc `shiftR` 18)
, 0x80 + ((oc `shiftR` 12) .&. 0x3f)
, 0x80 + ((oc `shiftR` 6) .&. 0x3f)
, 0x80 + oc .&. 0x3f
]
bst = T.decodeUtf8With (\ _ -> fmap B.w2c)
parseT = map textTag . parse
textTag (TagOpen t a) = TagOpen (bst t) [(bst n, bst v) | (n,v) <- a]
textTag (TagClose t) = TagClose (bst t)
textTag (TagText t) = TagText (bst t)
textTag (TagComment t) = TagComment (bst t)
textTag (TagWarning t) = TagWarning (bst t)
textTag (TagPosition r c) = TagPosition r c
escapeHtmlT :: T.Text -> T.Text
escapeHtmlT s
| not $ T.any (\ c -> c=='&'||c=='<'||c=='>'||c=='\''||c=='"') s = s
| otherwise = bst $ escapeHtml $ T.encodeUtf8 s
escapeHtml :: B.ByteString -> B.ByteString
escapeHtml s
| Nothing <- B8.find (\ c -> c=="&"||c=="<"||c==">"||c=="'"||c=="\"") s = s
| B.length s' == 0 = s
| otherwise = s'
where s' = B.inlinePerformIO $ withForeignPtr fp $ \ src ->
B.createAndTrim (len*6) $ \ dst -> do
(ch, dst') <- go (src `plusPtr` offset) dst len False
return $ if ch then dst' `minusPtr` dst else 0
(fp, offset, len) = B.toForeignPtr s
go :: P -> P -> Int -> Bool -> IO (Bool, P)
go !s !d !0 !c = return (c, d)
go !s !d !l !c
| r s == "&" = add s d l $ map B.c2w "&amp;"
| r s == "<" = add s d l $ map B.c2w "&lt;"
| r s == ">" = add s d l $ map B.c2w "&gt;"
| r s == "'" = add s d l $ map B.c2w "&apos;"
| r s == "\"" = add s d l $ map B.c2w "&quot;"
| otherwise =
poke d (r s) >> go (pp s) (pp d) (mm l) c
add !s !d !l [] = go (pp s) d (mm l) True
add !s !d !l (x:xs) = do
poke d x
add s (pp d) l xs
{-# INLINE unescapeHtml #-}
-- nTest f n u = do
-- c <- urlGetContents u
-- forM_ [1..n] $ \ i -> print $ length $ f $ B.concat [B.pack (show i), c]
-- test u = urlGetContents u >>= print . length . parse
-- test2 u = urlGetContents u >>= print . length . parseTags
render = render' escapeHtml B.concat
renderT = render' escapeHtmlT T.concat
render' escape concat = go []
where go acc [] = concat $ reverse acc
go acc (TagOpen t as : TagClose tc : ts) | t == tc =
go ("/>" : renderAtts (reverse as) (t : "<" : acc)) ts
go acc (TagOpen t as : ts) =
go (">" : renderAtts (reverse as) (t : "<" : acc)) ts
go acc (TagClose t : ts) =
go (">" : t : "</" : acc) ts
go acc (TagText t : ts) =
go (escape t : acc) ts
renderAtts [] r = r
renderAtts ((a,v):as) r =
"\"" : escape v : "=\"" : a : " " : renderAtts as r
{-# INLINE render #-}
-- -- | Использует тот факт, что внутри у нас одна строка и теги подряд
-- -- можно не рендерить, а тупо брать смещения.
-- renderSeqTags [] = ""
-- renderSeqTags (t:ts) =
-- where tagS (TagOpen
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment