redo interface to siphon

2017-12-14 22:30:01 -05:00 · 2017-12-14 22:30:01 -05:00 · f115e7798b
commit f115e7798b
parent 4f3e83a908
4 changed files with 176 additions and 52 deletions
--- a/siphon/siphon.cabal
+++ b/siphon/siphon.cabal
@ -1,5 +1,5 @@
 name: siphon
-version: 0.7.2
+version: 0.8.0
 synopsis: Encode and decode CSV files
 description: Please see README.md
 homepage: https://github.com/andrewthad/colonnade#readme
@ -19,13 +19,23 @@ library
    Siphon.Types
  build-depends:
      base >= 4.9 && < 5
-    , colonnade >= 1.1 && < 1.3
-    , text
+    , colonnade >= 1.1.1 && < 1.3
+    , text >= 1.0 && < 1.3
    , bytestring
    , vector
-    , streaming
+    , streaming >= 0.1.4 && < 0.3
    , attoparsec
-    , transformers
+    , transformers >= 0.5 && < 0.6
+  default-language:    Haskell2010
+
+test-suite doctest
+  type: exitcode-stdio-1.0
+  hs-source-dirs: test
+  main-is: Doctest.hs
+  build-depends:
+      base
+    , siphon
+    , doctest >= 0.10
  default-language:    Haskell2010

 test-suite siphon-test
--- a/siphon/src/Siphon.hs
+++ b/siphon/src/Siphon.hs
@ -3,18 +3,43 @@
 {-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE RankNTypes #-}

-- {-# OPTIONS_GHC -Wall -Werr -fno-warn-unused-imports #-}
+{-# OPTIONS_GHC -Wall -fno-warn-unused-imports #-}

+-- | Build CSVs using the abstractions provided in the @colonnade@ library, and 
+--   parse CSVs using 'Siphon', which is the dual of 'Colonnade'.
+--   Read the documentation for @colonnade@ before reading the documentation
+--   for @siphon@. All of the examples on this page assume the following
+--   setup:
+--
+--   >>> :set -XOverloadedStrings
+--   >>> import Siphon (Siphon)
+--   >>> import Colonnade (Colonnade,Headed)
+--   >>> import qualified Siphon as S
+--   >>> import qualified Colonnade as C
+--   >>> import qualified Data.Text as T
+--   >>> import qualified Data.Text.Lazy.IO as LTIO
+--   >>> import qualified Data.Text.Lazy.Builder as LB
+--   >>> import Data.Text (Text)
+--   >>> import Data.Maybe (fromMaybe)
+--   >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
 module Siphon
-  ( Siphon
-  , SiphonError
-  , Indexed(..)
+  ( -- * Encode CSV
+    encodeCsv
+  , encodeCsvStream
+  , encodeCsvUtf8
+  , encodeCsvStreamUtf8
+    -- * Decode CSV
  , decodeHeadedUtf8Csv
-  , encodeHeadedUtf8Csv
-  , humanizeSiphonError
+    -- * Build Siphon
  , headed
  , headless
  , indexed
+    -- * Types
+  , Siphon
+  , SiphonError
+  , Indexed(..)
+    -- * Utility
+  , humanizeSiphonError
  ) where

 import Siphon.Types
@ -32,6 +57,8 @@ import qualified Data.Vector as V
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as LByteString
 import qualified Data.ByteString.Builder as Builder
+import qualified Data.Text.Lazy as LT
+import qualified Data.Text.Lazy.Builder as TB
 import qualified Data.Text as T
 import qualified Data.List as L
 import qualified Streaming as SM
@ -39,9 +66,10 @@ import qualified Streaming.Prelude as SMP
 import qualified Data.Attoparsec.Types as ATYP
 import qualified Colonnade.Encode as CE
 import qualified Data.Vector.Mutable as MV
-
+import qualified Data.ByteString.Builder as BB

 import Control.Monad.Trans.Class
+import Data.Functor.Identity (Identity(..))
 import Data.ByteString.Builder (toLazyByteString,byteString)
 import Data.Attoparsec.ByteString.Char8 (char, endOfInput, string)
 import Data.Word (Word8)
@ -53,6 +81,7 @@ import Data.Text.Encoding (decodeUtf8')
 import Streaming (Stream,Of(..))
 import Data.Vector.Mutable (MVector)
 import Control.Monad.ST
+import Data.Text (Text)

 newtype Escaped c = Escaped { getEscaped :: c }
 data Ended = EndedYes | EndedNo
@ -74,40 +103,104 @@ decodeHeadedUtf8Csv headedSiphon s1 = do
        let requiredLength = V.length v
        consumeBodyUtf8 1 requiredLength ixedSiphon s2

-encodeHeadedUtf8Csv :: Monad m 
-  => CE.Colonnade CE.Headed a ByteString
+encodeCsvStreamUtf8 :: (Monad m, CE.Headedness h)
+  => CE.Colonnade h a ByteString
  -> Stream (Of a) m r
  -> Stream (Of ByteString) m r
-encodeHeadedUtf8Csv =
-  encodeHeadedCsv escapeChar8 (B.singleton comma) (B.singleton newline)
+encodeCsvStreamUtf8 =
+  encodeCsvInternal escapeChar8 (B.singleton comma) (B.singleton newline)

-encodeHeadedCsv :: Monad m
+encodeCsvStream :: (Monad m, CE.Headedness h)
+  => CE.Colonnade h a Text
+  -> Stream (Of a) m r
+  -> Stream (Of Text) m r
+encodeCsvStream =
+  encodeCsvInternal textEscapeChar8 (T.singleton ',') (T.singleton '\n')
+
+-- | Encode a collection to a CSV as a text 'TB.Builder'. For example,
+--   we can take the following columnar encoding of a person:
+--
+-- >>> :{
+-- let colPerson :: Colonnade Headed Person Text
+--     colPerson = mconcat
+--       [ C.headed "Name" name
+--       , C.headed "Age" (T.pack . show . age)
+--       , C.headed "Company" (fromMaybe "N/A" . company)
+--       ]
+-- :}
+--
+-- And we have the following people whom we wish to encode
+-- in this way:
+--
+-- >>> :{
+-- let people :: [Person]
+--     people =
+--       [ Person "Chao" 26 (Just "Tectonic, Inc.")
+--       , Person "Elsie" 41 (Just "Globex Corporation")
+--       , Person "Arabella" 19 Nothing
+--       ]
+-- :}
+--
+-- We pair the encoding with the rows to get a CSV:
+--
+-- >>> LTIO.putStr (TB.toLazyText (encodeCsv colPerson people))
+-- Name,Age,Company
+-- Chao,26,"Tectonic, Inc."
+-- Elsie,41,Globex Corporation
+-- Arabella,19,N/A
+encodeCsv :: (Foldable f, CE.Headedness h)
+  => CE.Colonnade h a Text -- ^ Tablular encoding
+  -> f a -- ^ Value of each row
+  -> TB.Builder
+encodeCsv enc = 
+  textStreamToBuilder . encodeCsvStream enc . SMP.each
+
+-- | Encode a collection to a CSV as a bytestring 'BB.Builder'.
+encodeCsvUtf8 :: (Foldable f, CE.Headedness h)
+  => CE.Colonnade h a ByteString -- ^ Tablular encoding
+  -> f a -- ^ Value of each row
+  -> BB.Builder
+encodeCsvUtf8 enc =
+  streamToBuilder . encodeCsvStreamUtf8 enc . SMP.each
+
+streamToBuilder :: Stream (Of ByteString) Identity () -> BB.Builder
+streamToBuilder s = SM.destroy s
+  (\(bs :> bb) -> BB.byteString bs <> bb) runIdentity (\() -> mempty)
+
+textStreamToBuilder :: Stream (Of Text) Identity () -> TB.Builder
+textStreamToBuilder s = SM.destroy s
+  (\(bs :> bb) -> TB.fromText bs <> bb) runIdentity (\() -> mempty)
+
+encodeCsvInternal :: (Monad m, CE.Headedness h)
  => (c -> Escaped c)
  -> c -- ^ separator
  -> c -- ^ newline
-  -> CE.Colonnade CE.Headed a c
+  -> CE.Colonnade h a c
  -> Stream (Of a) m r
  -> Stream (Of c) m r
-encodeHeadedCsv escapeFunc separatorStr newlineStr colonnade s = do
-  encodeHeader escapeFunc separatorStr newlineStr colonnade
+encodeCsvInternal escapeFunc separatorStr newlineStr colonnade s = do
+  case CE.headednessExtract of
+    Just toContent -> encodeHeader toContent escapeFunc separatorStr newlineStr colonnade
+    Nothing -> return ()
  encodeRows escapeFunc separatorStr newlineStr colonnade s

 encodeHeader :: Monad m
-  => (c -> Escaped c)
+  => (h c -> c)
+  -> (c -> Escaped c)
  -> c -- ^ separator
  -> c -- ^ newline
-  -> CE.Colonnade CE.Headed a c
+  -> CE.Colonnade h a c
  -> Stream (Of c) m ()
-encodeHeader escapeFunc separatorStr newlineStr colonnade = do
+encodeHeader toContent escapeFunc separatorStr newlineStr colonnade = do
  let (vs,ws) = V.splitAt 1 (CE.getColonnade colonnade)
  -- we only need to do this split because the first cell
  -- gets treated differently than the others. It does not
  -- get a separator added before it.
-  V.forM_ vs $ \(CE.OneColonnade (CE.Headed h) _) -> do
-    SMP.yield (getEscaped (escapeFunc h))
-  V.forM_ ws $ \(CE.OneColonnade (CE.Headed h) _) -> do
+  V.forM_ vs $ \(CE.OneColonnade h _) -> do
+    SMP.yield (getEscaped (escapeFunc (toContent h)))
+  V.forM_ ws $ \(CE.OneColonnade h _) -> do
    SMP.yield separatorStr
-    SMP.yield (getEscaped (escapeFunc h))
+    SMP.yield (getEscaped (escapeFunc (toContent h)))
  SMP.yield newlineStr

 mapStreamM :: Monad m
@ -189,7 +282,12 @@ escapeChar8 t = case B.find (\c -> c == newline || c == cr || c == comma || c ==
  Nothing -> Escaped t
  Just _  -> escapeAlways t

-- | This implementation is definitely suboptimal.
+textEscapeChar8 :: Text -> Escaped Text
+textEscapeChar8 t = case T.find (\c -> c == '\n' || c == '\r' || c == ',' || c == '"') t of
+  Nothing -> Escaped t
+  Just _  -> textEscapeAlways t
+
+-- This implementation is definitely suboptimal.
 -- A better option (which would waste a little space
 -- but would be much faster) would be to build the
 -- new bytestring by writing to a buffer directly.
@ -205,19 +303,18 @@ escapeAlways t = Escaped $ LByteString.toStrict $ Builder.toLazyByteString $
      t
  <> Builder.word8 doubleQuote

-- | Specialized version of 'sepBy1'' which is faster due to not
-- accepting an arbitrary separator.
-sepByDelim1' :: AL.Parser a
-             -> Word8  -- ^ Field delimiter
-             -> AL.Parser [a]
-sepByDelim1' p !delim = liftM2' (:) p loop
-  where
-    loop = do
-        mb <- A.peekWord8
-        case mb of
-            Just b | b == delim -> liftM2' (:) (A.anyWord8 *> p) loop
-            _                   -> pure []
-{-# INLINE sepByDelim1' #-}
+-- Suboptimal for similar reason as escapeAlways.
+textEscapeAlways :: Text -> Escaped Text
+textEscapeAlways t = Escaped $ LT.toStrict $ TB.toLazyText $
+     TB.singleton '"'
+  <> T.foldl
+      (\ acc b -> acc <> if b == '"'
+          then TB.fromString "\"\""
+          else TB.singleton b
+      )
+      mempty
+      t
+  <> TB.singleton '"'

 -- | Parse a record, not including the terminating line separator. The
 -- terminating line separate is not included as the last record in a
@ -353,13 +450,6 @@ liftM2' f a b = do
 {-# INLINE liftM2' #-}


-- | Match either a single newline character @\'\\n\'@, or a carriage
-- return followed by a newline character @\"\\r\\n\"@, or a single
-- carriage return @\'\\r\'@.
-endOfLine :: A.Parser ()
-endOfLine = (A.word8 newline *> return ()) <|> (string (BC8.pack "\r\n") *> return ()) <|> (A.word8 cr *> return ())
-{-# INLINE endOfLine #-}
-
 doubleQuote, newline, cr, comma :: Word8
 doubleQuote = 34
 newline = 10
@ -666,3 +756,20 @@ headed h f = SiphonAp (CE.Headed h) f (SiphonPure id)
 indexed :: Int -> (c -> Maybe a) -> Siphon Indexed c a
 indexed ix f = SiphonAp (Indexed ix) f (SiphonPure id)

+-- $setup
+--
+-- This code is copied from the head section. It has to be
+-- run before every set of tests.
+--
+-- >>> :set -XOverloadedStrings
+-- >>> import Siphon (Siphon)
+-- >>> import Colonnade (Colonnade,Headed)
+-- >>> import qualified Siphon as S
+-- >>> import qualified Colonnade as C
+-- >>> import qualified Data.Text as T
+-- >>> import Data.Text (Text)
+-- >>> import qualified Data.Text.Lazy.IO as LTIO
+-- >>> import qualified Data.Text.Lazy.Builder as LB
+-- >>> import Data.Maybe (fromMaybe)
+-- >>> data Person = Person { name :: Text, age :: Int, company :: Maybe Text}
+
--- a/siphon/test/Doctest.hs
+++ b/siphon/test/Doctest.hs
@ -0,0 +1,7 @@
+import Test.DocTest
+
+main :: IO ()
+main = doctest
+  [ "src/Siphon.hs"
+  ]
+
--- a/siphon/test/Test.hs
+++ b/siphon/test/Test.hs
@ -43,7 +43,7 @@ tests =
  [ testGroup "ByteString encode/decode"
    [ testCase "Headed Encoding (int,char,bool)"
        $ runTestScenario [(4,'c',False)]
-            S.encodeHeadedUtf8Csv
+            S.encodeCsvStreamUtf8
            encodingB
            $ ByteString.concat
              [ "number,letter,boolean\n"
@ -51,7 +51,7 @@ tests =
              ]
    , testCase "Headed Encoding (int,char,bool) monoidal building"
        $ runTestScenario [(4,'c',False)]
-            S.encodeHeadedUtf8Csv
+            S.encodeCsvStreamUtf8
            encodingC
            $ ByteString.concat
              [ "boolean,letter\n"
@ -59,7 +59,7 @@ tests =
              ]
    , testCase "Headed Encoding (escaped characters)"
        $ runTestScenario ["bob","there,be,commas","the \" quote"]
-            S.encodeHeadedUtf8Csv
+            S.encodeCsvStreamUtf8
            encodingF
            $ ByteString.concat
              [ "name\n"
@ -99,7 +99,7 @@ tests =
    , testProperty "Headed Isomorphism (int,char,bool)"
        $ propIsoStream BC8.unpack
          (S.decodeHeadedUtf8Csv decodingB)
-          (S.encodeHeadedUtf8Csv encodingB)
+          (S.encodeCsvStreamUtf8 encodingB)
    ]
  ]