From e006ecd3361a9c8577e307b2b0e4971825c82b54 Mon Sep 17 00:00:00 2001 From: Dylan Simon Date: Tue, 9 May 2017 20:47:49 -0400 Subject: [PATCH] Streaming unzip conduit, initial version Untested, still many limitations on zip format --- .gitignore | 1 + Codec/Archive/Zip/Conduit/UnZip.hs | 132 +++++++++++++++++++++++++++++ LICENSE | 30 +++++++ Setup.hs | 2 + stack.yaml | 4 + zip-stream.cabal | 32 +++++++ 6 files changed, 201 insertions(+) create mode 100644 .gitignore create mode 100644 Codec/Archive/Zip/Conduit/UnZip.hs create mode 100644 LICENSE create mode 100644 Setup.hs create mode 100644 stack.yaml create mode 100644 zip-stream.cabal diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6fabf46 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/.stack-work/ diff --git a/Codec/Archive/Zip/Conduit/UnZip.hs b/Codec/Archive/Zip/Conduit/UnZip.hs new file mode 100644 index 0000000..0028821 --- /dev/null +++ b/Codec/Archive/Zip/Conduit/UnZip.hs @@ -0,0 +1,132 @@ +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE RankNTypes #-} +module Codec.Archive.Zip.Conduit.UnZip + ( ZipEntry(..) + , ZipInfo(..) + , unZip + ) where + +import Control.Monad (when, unless) +import qualified Data.Binary.Get as G +import Data.Bits ((.&.), complement, shiftL, shiftR) +import qualified Data.ByteString as BS +import qualified Data.Conduit as C +import qualified Data.Conduit.List as CL +import Data.Conduit.Serialization.Binary (sinkGet) +import Data.Conduit.Zlib (WindowBits(..), decompress) +import Data.Digest.CRC32 (crc32Update) +import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..)) +import Data.Word (Word, Word32) + +data ZipEntry = ZipEntry + { zipEntryName :: BS.ByteString + , zipEntryTime :: UTCTime + , zipEntrySize :: Word + } + +data ZipInfo = ZipInfo + { zipComment :: BS.ByteString + } + +data Header m + = FileHeader + { fileDecompress :: C.Conduit BS.ByteString m BS.ByteString + , fileEntry :: !ZipEntry + , fileCRC :: !Word32 + , fileCSize :: !Word32 + } + | EndOfCentralDirectory + { endInfo :: ZipInfo + } + +crc32 :: Monad m => C.Consumer BS.ByteString m Word32 +crc32 = CL.fold crc32Update 0 + +checkCRC :: Monad m => Word32 -> C.Conduit BS.ByteString m BS.ByteString +checkCRC t = C.passthroughSink crc32 $ \r -> unless (r == t) $ fail "CRC32 mismatch" + +unZip :: C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) IO ZipInfo +unZip = next where + next = do + h <- sinkGet header + case h of + FileHeader{..} -> do + C.yield $ Left fileEntry + C.mapOutput Right $ pass (fromIntegral fileCSize) + C..| fileDecompress + C..| checkCRC fileCRC + next + EndOfCentralDirectory{..} -> do + return endInfo + header = do + sig <- G.getWord32le + case sig of + 0x04034b50 -> fileHeader + 0x08074b50 -> -- data descriptor + G.skip 12 >> header + _ -> centralDirectory sig + centralDirectory 0x02014b50 = centralHeader >> G.getWord32le >>= centralDirectory + centralDirectory 0x06054b50 = EndOfCentralDirectory <$> endDirectory + centralDirectory sig = fail $ "Unknown header signature: " ++ show sig + fileHeader = do + ver <- G.getWord16le + when (ver > 20) $ fail $ "Unsupported version: " ++ show ver + gpf <- G.getWord16le + when (gpf .&. complement 6 /= 0) $ fail $ "Unsupported flags: " ++ show gpf + comp <- G.getWord16le + dcomp <- case comp of + 0 -> return $ C.awaitForever C.yield + 8 -> return $ decompress (WindowBits (-15)) + _ -> fail $ "Unsupported compression method: " ++ show comp + time <- G.getWord16le + date <- G.getWord16le + let mtime = UTCTime (fromGregorian + (fromIntegral $ date `shiftR` 9 + 1980) + (fromIntegral $ date `shiftR` 5 .&. 0x0f) + (fromIntegral $ date .&. 0x1f) + ) + (timeOfDayToTime $ TimeOfDay + (fromIntegral $ time `shiftR` 11) + (fromIntegral $ time `shiftR` 5 .&. 0x3f) + (fromIntegral $ time `shiftL` 1 .&. 0x3f) + ) + crc <- G.getWord32le + csiz <- G.getWord32le + usiz <- G.getWord32le + nlen <- G.getWord16le + elen <- G.getWord16le + name <- G.getByteString $ fromIntegral nlen + G.skip $ fromIntegral elen + return FileHeader + { fileEntry = ZipEntry + { zipEntryName = name + , zipEntryTime = mtime + , zipEntrySize = fromIntegral usiz + } + , fileDecompress = dcomp + , fileCSize = csiz + , fileCRC = crc + } + centralHeader = do + -- ignore everything + G.skip 24 + nlen <- G.getWord16le + elen <- G.getWord16le + clen <- G.getWord16le + G.skip $ 12 + fromIntegral nlen + fromIntegral elen + fromIntegral clen + endDirectory = do + G.skip 16 + clen <- G.getWord16le + comm <- G.getByteString $ fromIntegral clen + return ZipInfo + { zipComment = comm + } + pass 0 = return () + pass n = C.await >>= maybe + (fail $ "EOF in file data, expecting " ++ show n ++ " more bytes") + (\b -> do + let (b', r) = BS.splitAt n b + C.yield b' + if BS.null r + then pass $ n - BS.length b' + else C.leftover r) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bb07a30 --- /dev/null +++ b/LICENSE @@ -0,0 +1,30 @@ +Copyright Dylan Simon (c) 2017 + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Dylan Simon nor the names of other + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/Setup.hs b/Setup.hs new file mode 100644 index 0000000..9a994af --- /dev/null +++ b/Setup.hs @@ -0,0 +1,2 @@ +import Distribution.Simple +main = defaultMain diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..e79a206 --- /dev/null +++ b/stack.yaml @@ -0,0 +1,4 @@ +resolver: lts-8.13 +packages: +- '.' +extra-deps: [] diff --git a/zip-stream.cabal b/zip-stream.cabal new file mode 100644 index 0000000..eef5d18 --- /dev/null +++ b/zip-stream.cabal @@ -0,0 +1,32 @@ +name: zip-stream +version: 0.1.0.0 +-- synopsis: +-- description: +homepage: https://github.com/dylex/zip-stream#readme +license: BSD3 +license-file: LICENSE +author: Dylan Simon +maintainer: dylan@dylex.net +copyright: 2017 +category: Codec +build-type: Simple +cabal-version: >=1.10 + +source-repository head + type: git + location: https://github.com/dylex/zip-stream + +library + exposed-modules: + Codec.Archive.Zip.Conduit.UnZip + default-language: Haskell2010 + ghc-options: -Wall + build-depends: + base >= 4.7 && < 5, + binary, + binary-conduit, + bytestring, + conduit, + conduit-extra, + digest, + time