copied some of cassavas parsing stuff. they omptimize pretty well.
This commit is contained in:
parent
db725eba69
commit
cb11a8bd9a
@ -20,6 +20,7 @@ library
|
||||
Siphon
|
||||
Siphon.Types
|
||||
Siphon.Encoding
|
||||
Siphon.Internal
|
||||
build-depends:
|
||||
base >= 4.7 && < 5
|
||||
, colonnade
|
||||
@ -28,6 +29,7 @@ library
|
||||
, contravariant
|
||||
, vector
|
||||
, pipes
|
||||
, attoparsec
|
||||
default-language: Haskell2010
|
||||
|
||||
source-repository head
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
module Siphon.Decoding where
|
||||
|
||||
import Siphon.Types
|
||||
import qualified Data.Attoparsec.Types as Atto
|
||||
|
||||
-- unrow :: c1 -> (Vector c2,c1)
|
||||
--
|
||||
@ -8,3 +9,33 @@ import Siphon.Types
|
||||
-- -> Decoding (Indexed f) c a
|
||||
-- -> Vector c
|
||||
-- -> Either DecodingErrors a
|
||||
|
||||
pipe :: SiphonDecoding c1 c2
|
||||
-> Atto.Parser c1 (WithEnd c2)
|
||||
-> Pipe c1 (Vector c2) m String
|
||||
pipe (SiphonDecoding parse isNull) p = do
|
||||
c1 <- awaitSkip isNull
|
||||
case parse p c1 of
|
||||
|
||||
awaitSkip :: (a -> Bool)
|
||||
-> Consumer' a m a
|
||||
awaitSkip f = go where
|
||||
go = do
|
||||
a <- await
|
||||
if f a then go else return a
|
||||
|
||||
nextSkipEmpty
|
||||
:: (Monad m, Eq a, Monoid a)
|
||||
=> Producer a m r
|
||||
-> m (Either r (a, Producer a m r))
|
||||
nextSkipEmpty = go where
|
||||
go p0 = do
|
||||
x <- next p0
|
||||
case x of
|
||||
Left _ -> return x
|
||||
Right (a,p1)
|
||||
| a == mempty -> go p1
|
||||
| otherwise -> return x
|
||||
{-# INLINABLE nextSkipEmpty #-}
|
||||
|
||||
|
||||
|
||||
188
siphon/src/Siphon/Internal.hs
Normal file
188
siphon/src/Siphon/Internal.hs
Normal file
@ -0,0 +1,188 @@
|
||||
{-# LANGUAGE BangPatterns #-}
|
||||
|
||||
-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
|
||||
-- the following extensions:
|
||||
--
|
||||
-- * Empty lines are ignored.
|
||||
--
|
||||
-- * Non-escaped fields may contain any characters except
|
||||
-- double-quotes, commas, carriage returns, and newlines.
|
||||
--
|
||||
-- * Escaped fields may contain any characters (but double-quotes
|
||||
-- need to be escaped).
|
||||
--
|
||||
-- The functions in this module can be used to implement e.g. a
|
||||
-- resumable parser that is fed input incrementally.
|
||||
module Siphon.Internal where
|
||||
|
||||
import Siphon.Types
|
||||
|
||||
import Data.ByteString.Builder (toLazyByteString,byteString)
|
||||
import qualified Data.ByteString.Char8 as BC8
|
||||
import Control.Applicative (optional)
|
||||
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
|
||||
import qualified Data.Attoparsec.ByteString as A
|
||||
import qualified Data.Attoparsec.Lazy as AL
|
||||
import qualified Data.Attoparsec.Zepto as Z
|
||||
import qualified Data.ByteString as S
|
||||
import qualified Data.ByteString.Unsafe as S
|
||||
import qualified Data.Vector as V
|
||||
import qualified Data.ByteString as B
|
||||
import qualified Data.ByteString.Lazy as LByteString
|
||||
import Data.Word (Word8)
|
||||
import Data.Vector (Vector)
|
||||
import Data.ByteString (ByteString)
|
||||
|
||||
import Control.Applicative
|
||||
import Data.Monoid
|
||||
|
||||
-- parse :: Monad m
|
||||
-- => SiphonDecoding c1 c2
|
||||
-- -> Atto.Parser a b -- ^ Attoparsec parser
|
||||
-- -> Pipes.Parser a m (Maybe (Either ParsingError b)) -- ^ Pipes parser
|
||||
-- parse parser = S.StateT $ \p0 -> do
|
||||
-- x <- nextSkipEmpty p0
|
||||
-- case x of
|
||||
-- Left r -> return (Nothing, return r)
|
||||
-- Right (a,p1) -> step (yield a >>) (_parse parser a) p1
|
||||
-- where
|
||||
-- step diffP res p0 = case res of
|
||||
-- Fail _ c m -> return (Just (Left (ParsingError c m)), diffP p0)
|
||||
-- Done a b -> return (Just (Right b), yield a >> p0)
|
||||
-- Partial k -> do
|
||||
-- x <- nextSkipEmpty p0
|
||||
-- case x of
|
||||
-- Left e -> step diffP (k mempty) (return e)
|
||||
-- Right (a,p1) -> step (diffP . (yield a >>)) (k a) p1
|
||||
|
||||
-- | Specialized version of 'sepBy1'' which is faster due to not
|
||||
-- accepting an arbitrary separator.
|
||||
sepByDelim1' :: AL.Parser a
|
||||
-> Word8 -- ^ Field delimiter
|
||||
-> AL.Parser [a]
|
||||
sepByDelim1' p !delim = liftM2' (:) p loop
|
||||
where
|
||||
loop = do
|
||||
mb <- A.peekWord8
|
||||
case mb of
|
||||
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
|
||||
_ -> pure []
|
||||
{-# INLINE sepByDelim1' #-}
|
||||
|
||||
-- | Specialized version of 'sepBy1'' which is faster due to not
|
||||
-- accepting an arbitrary separator.
|
||||
sepByEndOfLine1' :: AL.Parser a
|
||||
-> AL.Parser [a]
|
||||
sepByEndOfLine1' p = liftM2' (:) p loop
|
||||
where
|
||||
loop = do
|
||||
mb <- A.peekWord8
|
||||
case mb of
|
||||
Just b | b == cr ->
|
||||
liftM2' (:) (A.anyWord8 *> A.word8 newline *> p) loop
|
||||
| b == newline ->
|
||||
liftM2' (:) (A.anyWord8 *> p) loop
|
||||
_ -> pure []
|
||||
{-# INLINE sepByEndOfLine1' #-}
|
||||
|
||||
-- | Parse a record, not including the terminating line separator. The
|
||||
-- terminating line separate is not included as the last record in a
|
||||
-- CSV file is allowed to not have a terminating line separator. You
|
||||
-- most likely want to use the 'endOfLine' parser in combination with
|
||||
-- this parser.
|
||||
row :: Word8 -- ^ Field delimiter
|
||||
-> AL.Parser (Vector ByteString)
|
||||
row !delim = V.fromList <$!> field delim `sepByDelim1'` delim <* endOfLine
|
||||
{-# INLINE row #-}
|
||||
|
||||
removeBlankLines :: [Vector ByteString] -> [Vector ByteString]
|
||||
removeBlankLines = filter (not . blankLine)
|
||||
|
||||
-- | Parse a field. The field may be in either the escaped or
|
||||
-- non-escaped format. The return value is unescaped.
|
||||
field :: Word8 -> AL.Parser ByteString
|
||||
field !delim = do
|
||||
mb <- A.peekWord8
|
||||
-- We purposely don't use <|> as we want to commit to the first
|
||||
-- choice if we see a double quote.
|
||||
case mb of
|
||||
Just b | b == doubleQuote -> escapedField
|
||||
_ -> unescapedField delim
|
||||
{-# INLINE field #-}
|
||||
|
||||
escapedField :: AL.Parser S.ByteString
|
||||
escapedField = do
|
||||
_ <- dquote
|
||||
-- The scan state is 'True' if the previous character was a double
|
||||
-- quote. We need to drop a trailing double quote left by scan.
|
||||
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
|
||||
then Just (not s)
|
||||
else if s then Nothing
|
||||
else Just False)
|
||||
if doubleQuote `S.elem` s
|
||||
then case Z.parse unescape s of
|
||||
Right r -> return r
|
||||
Left err -> fail err
|
||||
else return s
|
||||
|
||||
unescapedField :: Word8 -> AL.Parser S.ByteString
|
||||
unescapedField !delim = A.takeWhile (\ c -> c /= doubleQuote &&
|
||||
c /= newline &&
|
||||
c /= delim &&
|
||||
c /= cr)
|
||||
|
||||
dquote :: AL.Parser Char
|
||||
dquote = char '"'
|
||||
|
||||
-- | This could be improved. We could avoid the builder and just
|
||||
-- write to a buffer directly.
|
||||
unescape :: Z.Parser S.ByteString
|
||||
unescape = (LByteString.toStrict . toLazyByteString) <$!> go mempty where
|
||||
go acc = do
|
||||
h <- Z.takeWhile (/= doubleQuote)
|
||||
let rest = do
|
||||
start <- Z.take 2
|
||||
if (S.unsafeHead start == doubleQuote &&
|
||||
S.unsafeIndex start 1 == doubleQuote)
|
||||
then go (acc `mappend` byteString h `mappend` byteString (BC8.singleton '"'))
|
||||
else fail "invalid CSV escape sequence"
|
||||
done <- Z.atEnd
|
||||
if done
|
||||
then return (acc `mappend` byteString h)
|
||||
else rest
|
||||
|
||||
-- | A strict version of 'Data.Functor.<$>' for monads.
|
||||
(<$!>) :: Monad m => (a -> b) -> m a -> m b
|
||||
f <$!> m = do
|
||||
a <- m
|
||||
return $! f a
|
||||
{-# INLINE (<$!>) #-}
|
||||
|
||||
infixl 4 <$!>
|
||||
|
||||
-- | Is this an empty record (i.e. a blank line)?
|
||||
blankLine :: V.Vector B.ByteString -> Bool
|
||||
blankLine v = V.length v == 1 && (B.null (V.head v))
|
||||
|
||||
-- | A version of 'liftM2' that is strict in the result of its first
|
||||
-- action.
|
||||
liftM2' :: (Monad m) => (a -> b -> c) -> m a -> m b -> m c
|
||||
liftM2' f a b = do
|
||||
!x <- a
|
||||
y <- b
|
||||
return (f x y)
|
||||
{-# INLINE liftM2' #-}
|
||||
|
||||
|
||||
-- | Match either a single newline character @\'\\n\'@, or a carriage
|
||||
-- return followed by a newline character @\"\\r\\n\"@, or a single
|
||||
-- carriage return @\'\\r\'@.
|
||||
endOfLine :: A.Parser ()
|
||||
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
|
||||
{-# INLINE endOfLine #-}
|
||||
|
||||
doubleQuote, newline, cr :: Word8
|
||||
doubleQuote = 34
|
||||
newline = 10
|
||||
cr = 13
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
module Siphon.Types where
|
||||
|
||||
import Data.Vector (Vector)
|
||||
import qualified Data.Attoparsec.Types as Atto
|
||||
|
||||
newtype Escaped c = Escaped { getEscaped :: c }
|
||||
|
||||
@ -12,6 +13,17 @@ data Siphon c1 c2 = Siphon
|
||||
, siphonIntercalate :: !(Vector (Escaped c2) -> c2)
|
||||
}
|
||||
|
||||
-- data Clarify = Clarify
|
||||
data SiphonDecoding c1 c2 = SiphonDecoding
|
||||
{ siphonDecodingParse :: Atto.Parser c1 c2 -> c1 -> Atto.IResult c1 c2
|
||||
, siphonDecodingNull :: c1 -> Bool
|
||||
}
|
||||
|
||||
data WithEnd c = WithEnd
|
||||
{ withEndEnded :: Bool
|
||||
, withEndContent :: c
|
||||
}
|
||||
|
||||
-- data SiphonDecodingError
|
||||
-- { clarify
|
||||
-- }
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user