From cb11a8bd9a50d71009e572a6e2233fb4fd047fb5 Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Sun, 26 Jun 2016 13:48:53 -0400 Subject: [PATCH] copied some of cassavas parsing stuff. they omptimize pretty well. --- siphon/siphon.cabal | 2 + siphon/src/Siphon/Decoding.hs | 31 ++++++ siphon/src/Siphon/Internal.hs | 188 ++++++++++++++++++++++++++++++++++ siphon/src/Siphon/Types.hs | 14 ++- 4 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 siphon/src/Siphon/Internal.hs diff --git a/siphon/siphon.cabal b/siphon/siphon.cabal index 82a0da7..dd3fb45 100644 --- a/siphon/siphon.cabal +++ b/siphon/siphon.cabal @@ -20,6 +20,7 @@ library Siphon Siphon.Types Siphon.Encoding + Siphon.Internal build-depends: base >= 4.7 && < 5 , colonnade @@ -28,6 +29,7 @@ library , contravariant , vector , pipes + , attoparsec default-language: Haskell2010 source-repository head diff --git a/siphon/src/Siphon/Decoding.hs b/siphon/src/Siphon/Decoding.hs index 94754f4..0b14fd3 100644 --- a/siphon/src/Siphon/Decoding.hs +++ b/siphon/src/Siphon/Decoding.hs @@ -1,6 +1,7 @@ module Siphon.Decoding where import Siphon.Types +import qualified Data.Attoparsec.Types as Atto -- unrow :: c1 -> (Vector c2,c1) -- @@ -8,3 +9,33 @@ import Siphon.Types -- -> Decoding (Indexed f) c a -- -> Vector c -- -> Either DecodingErrors a + +pipe :: SiphonDecoding c1 c2 + -> Atto.Parser c1 (WithEnd c2) + -> Pipe c1 (Vector c2) m String +pipe (SiphonDecoding parse isNull) p = do + c1 <- awaitSkip isNull + case parse p c1 of + +awaitSkip :: (a -> Bool) + -> Consumer' a m a +awaitSkip f = go where + go = do + a <- await + if f a then go else return a + +nextSkipEmpty + :: (Monad m, Eq a, Monoid a) + => Producer a m r + -> m (Either r (a, Producer a m r)) +nextSkipEmpty = go where + go p0 = do + x <- next p0 + case x of + Left _ -> return x + Right (a,p1) + | a == mempty -> go p1 + | otherwise -> return x +{-# INLINABLE nextSkipEmpty #-} + + diff --git a/siphon/src/Siphon/Internal.hs b/siphon/src/Siphon/Internal.hs new file mode 100644 index 0000000..d10283b --- /dev/null +++ b/siphon/src/Siphon/Internal.hs @@ -0,0 +1,188 @@ +{-# LANGUAGE BangPatterns #-} + +-- | A CSV parser. The parser defined here is RFC 4180 compliant, with +-- the following extensions: +-- +-- * Empty lines are ignored. +-- +-- * Non-escaped fields may contain any characters except +-- double-quotes, commas, carriage returns, and newlines. +-- +-- * Escaped fields may contain any characters (but double-quotes +-- need to be escaped). +-- +-- The functions in this module can be used to implement e.g. a +-- resumable parser that is fed input incrementally. +module Siphon.Internal where + +import Siphon.Types + +import Data.ByteString.Builder (toLazyByteString,byteString) +import qualified Data.ByteString.Char8 as BC8 +import Control.Applicative (optional) +import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string) +import qualified Data.Attoparsec.ByteString as A +import qualified Data.Attoparsec.Lazy as AL +import qualified Data.Attoparsec.Zepto as Z +import qualified Data.ByteString as S +import qualified Data.ByteString.Unsafe as S +import qualified Data.Vector as V +import qualified Data.ByteString as B +import qualified Data.ByteString.Lazy as LByteString +import Data.Word (Word8) +import Data.Vector (Vector) +import Data.ByteString (ByteString) + +import Control.Applicative +import Data.Monoid + +-- parse :: Monad m +-- => SiphonDecoding c1 c2 +-- -> Atto.Parser a b -- ^ Attoparsec parser +-- -> Pipes.Parser a m (Maybe (Either ParsingError b)) -- ^ Pipes parser +-- parse parser = S.StateT $ \p0 -> do +-- x <- nextSkipEmpty p0 +-- case x of +-- Left r -> return (Nothing, return r) +-- Right (a,p1) -> step (yield a >>) (_parse parser a) p1 +-- where +-- step diffP res p0 = case res of +-- Fail _ c m -> return (Just (Left (ParsingError c m)), diffP p0) +-- Done a b -> return (Just (Right b), yield a >> p0) +-- Partial k -> do +-- x <- nextSkipEmpty p0 +-- case x of +-- Left e -> step diffP (k mempty) (return e) +-- Right (a,p1) -> step (diffP . (yield a >>)) (k a) p1 + +-- | Specialized version of 'sepBy1'' which is faster due to not +-- accepting an arbitrary separator. +sepByDelim1' :: AL.Parser a + -> Word8 -- ^ Field delimiter + -> AL.Parser [a] +sepByDelim1' p !delim = liftM2' (:) p loop + where + loop = do + mb <- A.peekWord8 + case mb of + Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop + _ -> pure [] +{-# INLINE sepByDelim1' #-} + +-- | Specialized version of 'sepBy1'' which is faster due to not +-- accepting an arbitrary separator. +sepByEndOfLine1' :: AL.Parser a + -> AL.Parser [a] +sepByEndOfLine1' p = liftM2' (:) p loop + where + loop = do + mb <- A.peekWord8 + case mb of + Just b | b == cr -> + liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop + | b == newline -> + liftM2' (:) (A.anyWord8 *> p) loop + _ -> pure [] +{-# INLINE sepByEndOfLine1' #-} + +-- | Parse a record, not including the terminating line separator. The +-- terminating line separate is not included as the last record in a +-- CSV file is allowed to not have a terminating line separator. You +-- most likely want to use the 'endOfLine' parser in combination with +-- this parser. +row :: Word8 -- ^ Field delimiter + -> AL.Parser (Vector ByteString) +row !delim = V.fromList <$!> field delim `sepByDelim1'` delim <* endOfLine +{-# INLINE row #-} + +removeBlankLines :: [Vector ByteString] -> [Vector ByteString] +removeBlankLines = filter (not . blankLine) + +-- | Parse a field. The field may be in either the escaped or +-- non-escaped format. The return value is unescaped. +field :: Word8 -> AL.Parser ByteString +field !delim = do + mb <- A.peekWord8 + -- We purposely don't use <|> as we want to commit to the first + -- choice if we see a double quote. + case mb of + Just b | b == doubleQuote -> escapedField + _ -> unescapedField delim +{-# INLINE field #-} + +escapedField :: AL.Parser S.ByteString +escapedField = do + _ <- dquote + -- The scan state is 'True' if the previous character was a double + -- quote. We need to drop a trailing double quote left by scan. + s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote + then Just (not s) + else if s then Nothing + else Just False) + if doubleQuote `S.elem` s + then case Z.parse unescape s of + Right r -> return r + Left err -> fail err + else return s + +unescapedField :: Word8 -> AL.Parser S.ByteString +unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote && + c /= newline && + c /= delim && + c /= cr) + +dquote :: AL.Parser Char +dquote = char '"' + +-- | This could be improved. We could avoid the builder and just +-- write to a buffer directly. +unescape :: Z.Parser S.ByteString +unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where + go acc = do + h <- Z.takeWhile (/= doubleQuote) + let rest = do + start <- Z.take 2 + if (S.unsafeHead start == doubleQuote && + S.unsafeIndex start 1 == doubleQuote) + then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"')) + else fail "invalid CSV escape sequence" + done <- Z.atEnd + if done + then return (acc `mappend` byteString h) + else rest + +-- | A strict version of 'Data.Functor.<$>' for monads. +(<$!>) :: Monad m => (a -> b) -> m a -> m b +f <$!> m = do + a <- m + return $! f a +{-# INLINE (<$!>) #-} + +infixl 4 <$!> + +-- | Is this an empty record (i.e. a blank line)? +blankLine :: V.Vector B.ByteString -> Bool +blankLine v = V.length v == 1 && (B.null (V.head v)) + +-- | A version of 'liftM2' that is strict in the result of its first +-- action. +liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c +liftM2' f a b = do + !x <- a + y <- b + return (f x y) +{-# INLINE liftM2' #-} + + +-- | Match either a single newline character @\'\\n\'@, or a carriage +-- return followed by a newline character @\"\\r\\n\"@, or a single +-- carriage return @\'\\r\'@. +endOfLine :: A.Parser () +endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ()) +{-# INLINE endOfLine #-} + +doubleQuote, newline, cr :: Word8 +doubleQuote = 34 +newline = 10 +cr = 13 + diff --git a/siphon/src/Siphon/Types.hs b/siphon/src/Siphon/Types.hs index 083f695..08b8cf3 100644 --- a/siphon/src/Siphon/Types.hs +++ b/siphon/src/Siphon/Types.hs @@ -1,6 +1,7 @@ module Siphon.Types where import Data.Vector (Vector) +import qualified Data.Attoparsec.Types as Atto newtype Escaped c = Escaped { getEscaped :: c } @@ -12,6 +13,17 @@ data Siphon c1 c2 = Siphon , siphonIntercalate :: !(Vector (Escaped c2) -> c2) } --- data Clarify = Clarify +data SiphonDecoding c1 c2 = SiphonDecoding + { siphonDecodingParse :: Atto.Parser c1 c2 -> c1 -> Atto.IResult c1 c2 + , siphonDecodingNull :: c1 -> Bool + } + +data WithEnd c = WithEnd + { withEndEnded :: Bool + , withEndContent :: c + } + +-- data SiphonDecodingError -- { clarify -- } +