redo interface to siphon

This commit is contained in:
Andrew Martin 2017-12-14 22:30:01 -05:00
parent 4f3e83a908
commit f115e7798b
4 changed files with 176 additions and 52 deletions

View File

@ -1,5 +1,5 @@
name: siphon name: siphon
version: 0.7.2 version: 0.8.0
synopsis: Encode and decode CSV files synopsis: Encode and decode CSV files
description: Please see README.md description: Please see README.md
homepage: https://github.com/andrewthad/colonnade#readme homepage: https://github.com/andrewthad/colonnade#readme
@ -19,13 +19,23 @@ library
Siphon.Types Siphon.Types
build-depends: build-depends:
base >= 4.9 && < 5 base >= 4.9 && < 5
, colonnade >= 1.1 && < 1.3 , colonnade >= 1.1.1 && < 1.3
, text , text >= 1.0 && < 1.3
, bytestring , bytestring
, vector , vector
, streaming , streaming >= 0.1.4 && < 0.3
, attoparsec , attoparsec
, transformers , transformers >= 0.5 && < 0.6
default-language: Haskell2010
test-suite doctest
type: exitcode-stdio-1.0
hs-source-dirs: test
main-is: Doctest.hs
build-depends:
base
, siphon
, doctest >= 0.10
default-language: Haskell2010 default-language: Haskell2010
test-suite siphon-test test-suite siphon-test

View File

@ -3,18 +3,43 @@
{-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE RankNTypes #-} {-# LANGUAGE RankNTypes #-}
-- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-} {-# OPTIONS_GHC -Wall -fno-warn-unused-imports #-}
-- | Build CSVs using the abstractions provided in the @colonnade@ library, and
-- parse CSVs using 'Siphon', which is the dual of 'Colonnade'.
-- Read the documentation for @colonnade@ before reading the documentation
-- for @siphon@. All of the examples on this page assume the following
-- setup:
--
-- >>> :set -XOverloadedStrings
-- >>> import Siphon (Siphon)
-- >>> import Colonnade (Colonnade,Headed)
-- >>> import qualified Siphon as S
-- >>> import qualified Colonnade as C
-- >>> import qualified Data.Text as T
-- >>> import qualified Data.Text.Lazy.IO as LTIO
-- >>> import qualified Data.Text.Lazy.Builder as LB
-- >>> import Data.Text (Text)
-- >>> import Data.Maybe (fromMaybe)
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
module Siphon module Siphon
( Siphon ( -- * Encode CSV
, SiphonError encodeCsv
, Indexed(..) , encodeCsvStream
, encodeCsvUtf8
, encodeCsvStreamUtf8
-- * Decode CSV
, decodeHeadedUtf8Csv , decodeHeadedUtf8Csv
, encodeHeadedUtf8Csv -- * Build Siphon
, humanizeSiphonError
, headed , headed
, headless , headless
, indexed , indexed
-- * Types
, Siphon
, SiphonError
, Indexed(..)
-- * Utility
, humanizeSiphonError
) where ) where
import Siphon.Types import Siphon.Types
@ -32,6 +57,8 @@ import qualified Data.Vector as V
import qualified Data.ByteString as B import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as LByteString import qualified Data.ByteString.Lazy as LByteString
import qualified Data.ByteString.Builder as Builder import qualified Data.ByteString.Builder as Builder
import qualified Data.Text.Lazy as LT
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text as T import qualified Data.Text as T
import qualified Data.List as L import qualified Data.List as L
import qualified Streaming as SM import qualified Streaming as SM
@ -39,9 +66,10 @@ import qualified Streaming.Prelude as SMP
import qualified Data.Attoparsec.Types as ATYP import qualified Data.Attoparsec.Types as ATYP
import qualified Colonnade.Encode as CE import qualified Colonnade.Encode as CE
import qualified Data.Vector.Mutable as MV import qualified Data.Vector.Mutable as MV
import qualified Data.ByteString.Builder as BB
import Control.Monad.Trans.Class import Control.Monad.Trans.Class
import Data.Functor.Identity (Identity(..))
import Data.ByteString.Builder (toLazyByteString,byteString) import Data.ByteString.Builder (toLazyByteString,byteString)
import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string) import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
import Data.Word (Word8) import Data.Word (Word8)
@ -53,6 +81,7 @@ import Data.Text.Encoding (decodeUtf8')
import Streaming (Stream,Of(..)) import Streaming (Stream,Of(..))
import Data.Vector.Mutable (MVector) import Data.Vector.Mutable (MVector)
import Control.Monad.ST import Control.Monad.ST
import Data.Text (Text)
newtype Escaped c = Escaped { getEscaped :: c } newtype Escaped c = Escaped { getEscaped :: c }
data Ended = EndedYes | EndedNo data Ended = EndedYes | EndedNo
@ -74,40 +103,104 @@ decodeHeadedUtf8Csv headedSiphon s1 = do
let requiredLength = V.length v let requiredLength = V.length v
consumeBodyUtf8 1 requiredLength ixedSiphon s2 consumeBodyUtf8 1 requiredLength ixedSiphon s2
encodeHeadedUtf8Csv :: Monad m encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h)
=> CE.Colonnade CE.Headed a ByteString => CE.Colonnade h a ByteString
-> Stream (Of a) m r -> Stream (Of a) m r
-> Stream (Of ByteString) m r -> Stream (Of ByteString) m r
encodeHeadedUtf8Csv = encodeCsvStreamUtf8 =
encodeHeadedCsv escapeChar8 (B.singleton comma) (B.singleton newline) encodeCsvInternal escapeChar8 (B.singleton comma) (B.singleton newline)
encodeHeadedCsv :: Monad m encodeCsvStream :: (Monad m, CE.Headedness h)
=> CE.Colonnade h a Text
-> Stream (Of a) m r
-> Stream (Of Text) m r
encodeCsvStream =
encodeCsvInternal textEscapeChar8 (T.singleton ',') (T.singleton '\n')
-- | Encode a collection to a CSV as a text 'TB.Builder'. For example,
-- we can take the following columnar encoding of a person:
--
-- >>> :{
-- let colPerson :: Colonnade Headed Person Text
-- colPerson = mconcat
-- [ C.headed "Name" name
-- , C.headed "Age" (T.pack . show . age)
-- , C.headed "Company" (fromMaybe "N/A" . company)
-- ]
-- :}
--
-- And we have the following people whom we wish to encode
-- in this way:
--
-- >>> :{
-- let people :: [Person]
-- people =
-- [ Person "Chao" 26 (Just "Tectonic, Inc.")
-- , Person "Elsie" 41 (Just "Globex Corporation")
-- , Person "Arabella" 19 Nothing
-- ]
-- :}
--
-- We pair the encoding with the rows to get a CSV:
--
-- >>> LTIO.putStr (TB.toLazyText (encodeCsv colPerson people))
-- Name,Age,Company
-- Chao,26,"Tectonic, Inc."
-- Elsie,41,Globex Corporation
-- Arabella,19,N/A
encodeCsv :: (Foldable f, CE.Headedness h)
=> CE.Colonnade h a Text -- ^ Tablular encoding
-> f a -- ^ Value of each row
-> TB.Builder
encodeCsv enc =
textStreamToBuilder . encodeCsvStream enc . SMP.each
-- | Encode a collection to a CSV as a bytestring 'BB.Builder'.
encodeCsvUtf8 :: (Foldable f, CE.Headedness h)
=> CE.Colonnade h a ByteString -- ^ Tablular encoding
-> f a -- ^ Value of each row
-> BB.Builder
encodeCsvUtf8 enc =
streamToBuilder . encodeCsvStreamUtf8 enc . SMP.each
streamToBuilder :: Stream (Of ByteString) Identity () -> BB.Builder
streamToBuilder s = SM.destroy s
(\(bs :> bb) -> BB.byteString bs <> bb) runIdentity (\() -> mempty)
textStreamToBuilder :: Stream (Of Text) Identity () -> TB.Builder
textStreamToBuilder s = SM.destroy s
(\(bs :> bb) -> TB.fromText bs <> bb) runIdentity (\() -> mempty)
encodeCsvInternal :: (Monad m, CE.Headedness h)
=> (c -> Escaped c) => (c -> Escaped c)
-> c -- ^ separator -> c -- ^ separator
-> c -- ^ newline -> c -- ^ newline
-> CE.Colonnade CE.Headed a c -> CE.Colonnade h a c
-> Stream (Of a) m r -> Stream (Of a) m r
-> Stream (Of c) m r -> Stream (Of c) m r
encodeHeadedCsv escapeFunc separatorStr newlineStr colonnade s = do encodeCsvInternal escapeFunc separatorStr newlineStr colonnade s = do
encodeHeader escapeFunc separatorStr newlineStr colonnade case CE.headednessExtract of
Just toContent -> encodeHeader toContent escapeFunc separatorStr newlineStr colonnade
Nothing -> return ()
encodeRows escapeFunc separatorStr newlineStr colonnade s encodeRows escapeFunc separatorStr newlineStr colonnade s
encodeHeader :: Monad m encodeHeader :: Monad m
=> (c -> Escaped c) => (h c -> c)
-> (c -> Escaped c)
-> c -- ^ separator -> c -- ^ separator
-> c -- ^ newline -> c -- ^ newline
-> CE.Colonnade CE.Headed a c -> CE.Colonnade h a c
-> Stream (Of c) m () -> Stream (Of c) m ()
encodeHeader escapeFunc separatorStr newlineStr colonnade = do encodeHeader toContent escapeFunc separatorStr newlineStr colonnade = do
let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade) let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
-- we only need to do this split because the first cell -- we only need to do this split because the first cell
-- gets treated differently than the others. It does not -- gets treated differently than the others. It does not
-- get a separator added before it. -- get a separator added before it.
V.forM_ vs $ \(CE.OneColonnade (CE.Headed h) _) -> do V.forM_ vs $ \(CE.OneColonnade h _) -> do
SMP.yield (getEscaped (escapeFunc h)) SMP.yield (getEscaped (escapeFunc (toContent h)))
V.forM_ ws $ \(CE.OneColonnade (CE.Headed h) _) -> do V.forM_ ws $ \(CE.OneColonnade h _) -> do
SMP.yield separatorStr SMP.yield separatorStr
SMP.yield (getEscaped (escapeFunc h)) SMP.yield (getEscaped (escapeFunc (toContent h)))
SMP.yield newlineStr SMP.yield newlineStr
mapStreamM :: Monad m mapStreamM :: Monad m
@ -189,7 +282,12 @@ escapeChar8 t = case B.find (\c -> c == newline || c == cr || c == comma || c ==
Nothing -> Escaped t Nothing -> Escaped t
Just _ -> escapeAlways t Just _ -> escapeAlways t
-- | This implementation is definitely suboptimal. textEscapeChar8 :: Text -> Escaped Text
textEscapeChar8 t = case T.find (\c -> c == '\n' || c == '\r' || c == ',' || c == '"') t of
Nothing -> Escaped t
Just _ -> textEscapeAlways t
-- This implementation is definitely suboptimal.
-- A better option (which would waste a little space -- A better option (which would waste a little space
-- but would be much faster) would be to build the -- but would be much faster) would be to build the
-- new bytestring by writing to a buffer directly. -- new bytestring by writing to a buffer directly.
@ -205,19 +303,18 @@ escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
t t
<> Builder.word8 doubleQuote <> Builder.word8 doubleQuote
-- | Specialized version of 'sepBy1'' which is faster due to not -- Suboptimal for similar reason as escapeAlways.
-- accepting an arbitrary separator. textEscapeAlways :: Text -> Escaped Text
sepByDelim1' :: AL.Parser a textEscapeAlways t = Escaped $ LT.toStrict $ TB.toLazyText $
-> Word8 -- ^ Field delimiter TB.singleton '"'
-> AL.Parser [a] <> T.foldl
sepByDelim1' p !delim = liftM2' (:) p loop (\ acc b -> acc <> if b == '"'
where then TB.fromString "\"\""
loop = do else TB.singleton b
mb <- A.peekWord8 )
case mb of mempty
Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop t
_ -> pure [] <> TB.singleton '"'
{-# INLINE sepByDelim1' #-}
-- | Parse a record, not including the terminating line separator. The -- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a -- terminating line separate is not included as the last record in a
@ -353,13 +450,6 @@ liftM2' f a b = do
{-# INLINE liftM2' #-} {-# INLINE liftM2' #-}
-- | Match either a single newline character @\'\\n\'@, or a carriage
-- return followed by a newline character @\"\\r\\n\"@, or a single
-- carriage return @\'\\r\'@.
endOfLine :: A.Parser ()
endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
{-# INLINE endOfLine #-}
doubleQuote, newline, cr, comma :: Word8 doubleQuote, newline, cr, comma :: Word8
doubleQuote = 34 doubleQuote = 34
newline = 10 newline = 10
@ -666,3 +756,20 @@ headed h f = SiphonAp (CE.Headed h) f (SiphonPure id)
indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a
indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id) indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id)
-- $setup
--
-- This code is copied from the head section. It has to be
-- run before every set of tests.
--
-- >>> :set -XOverloadedStrings
-- >>> import Siphon (Siphon)
-- >>> import Colonnade (Colonnade,Headed)
-- >>> import qualified Siphon as S
-- >>> import qualified Colonnade as C
-- >>> import qualified Data.Text as T
-- >>> import Data.Text (Text)
-- >>> import qualified Data.Text.Lazy.IO as LTIO
-- >>> import qualified Data.Text.Lazy.Builder as LB
-- >>> import Data.Maybe (fromMaybe)
-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}

7
siphon/test/Doctest.hs Normal file
View File

@ -0,0 +1,7 @@
import Test.DocTest
main :: IO ()
main = doctest
[ "src/Siphon.hs"
]

View File

@ -43,7 +43,7 @@ tests =
[ testGroup "ByteString encode/decode" [ testGroup "ByteString encode/decode"
[ testCase "Headed Encoding (int,char,bool)" [ testCase "Headed Encoding (int,char,bool)"
$ runTestScenario [(4,'c',False)] $ runTestScenario [(4,'c',False)]
S.encodeHeadedUtf8Csv S.encodeCsvStreamUtf8
encodingB encodingB
$ ByteString.concat $ ByteString.concat
[ "number,letter,boolean\n" [ "number,letter,boolean\n"
@ -51,7 +51,7 @@ tests =
] ]
, testCase "Headed Encoding (int,char,bool) monoidal building" , testCase "Headed Encoding (int,char,bool) monoidal building"
$ runTestScenario [(4,'c',False)] $ runTestScenario [(4,'c',False)]
S.encodeHeadedUtf8Csv S.encodeCsvStreamUtf8
encodingC encodingC
$ ByteString.concat $ ByteString.concat
[ "boolean,letter\n" [ "boolean,letter\n"
@ -59,7 +59,7 @@ tests =
] ]
, testCase "Headed Encoding (escaped characters)" , testCase "Headed Encoding (escaped characters)"
$ runTestScenario ["bob","there,be,commas","the \" quote"] $ runTestScenario ["bob","there,be,commas","the \" quote"]
S.encodeHeadedUtf8Csv S.encodeCsvStreamUtf8
encodingF encodingF
$ ByteString.concat $ ByteString.concat
[ "name\n" [ "name\n"
@ -99,7 +99,7 @@ tests =
, testProperty "Headed Isomorphism (int,char,bool)" , testProperty "Headed Isomorphism (int,char,bool)"
$ propIsoStream BC8.unpack $ propIsoStream BC8.unpack
(S.decodeHeadedUtf8Csv decodingB) (S.decodeHeadedUtf8Csv decodingB)
(S.encodeHeadedUtf8Csv encodingB) (S.encodeCsvStreamUtf8 encodingB)
] ]
] ]