Streaming unzip conduit, initial version

Untested, still many limitations on zip format
This commit is contained in:
Dylan Simon 2017-05-09 20:47:49 -04:00
commit e006ecd336
6 changed files with 201 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/.stack-work/

View File

@ -0,0 +1,132 @@
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE RankNTypes #-}
module Codec.Archive.Zip.Conduit.UnZip
( ZipEntry(..)
, ZipInfo(..)
, unZip
) where
import Control.Monad (when, unless)
import qualified Data.Binary.Get as G
import Data.Bits ((.&.), complement, shiftL, shiftR)
import qualified Data.ByteString as BS
import qualified Data.Conduit as C
import qualified Data.Conduit.List as CL
import Data.Conduit.Serialization.Binary (sinkGet)
import Data.Conduit.Zlib (WindowBits(..), decompress)
import Data.Digest.CRC32 (crc32Update)
import Data.Time (UTCTime(..), fromGregorian, timeOfDayToTime, TimeOfDay(..))
import Data.Word (Word, Word32)
data ZipEntry = ZipEntry
{ zipEntryName :: BS.ByteString
, zipEntryTime :: UTCTime
, zipEntrySize :: Word
}
data ZipInfo = ZipInfo
{ zipComment :: BS.ByteString
}
data Header m
= FileHeader
{ fileDecompress :: C.Conduit BS.ByteString m BS.ByteString
, fileEntry :: !ZipEntry
, fileCRC :: !Word32
, fileCSize :: !Word32
}
| EndOfCentralDirectory
{ endInfo :: ZipInfo
}
crc32 :: Monad m => C.Consumer BS.ByteString m Word32
crc32 = CL.fold crc32Update 0
checkCRC :: Monad m => Word32 -> C.Conduit BS.ByteString m BS.ByteString
checkCRC t = C.passthroughSink crc32 $ \r -> unless (r == t) $ fail "CRC32 mismatch"
unZip :: C.ConduitM BS.ByteString (Either ZipEntry BS.ByteString) IO ZipInfo
unZip = next where
next = do
h <- sinkGet header
case h of
FileHeader{..} -> do
C.yield $ Left fileEntry
C.mapOutput Right $ pass (fromIntegral fileCSize)
C..| fileDecompress
C..| checkCRC fileCRC
next
EndOfCentralDirectory{..} -> do
return endInfo
header = do
sig <- G.getWord32le
case sig of
0x04034b50 -> fileHeader
0x08074b50 -> -- data descriptor
G.skip 12 >> header
_ -> centralDirectory sig
centralDirectory 0x02014b50 = centralHeader >> G.getWord32le >>= centralDirectory
centralDirectory 0x06054b50 = EndOfCentralDirectory <$> endDirectory
centralDirectory sig = fail $ "Unknown header signature: " ++ show sig
fileHeader = do
ver <- G.getWord16le
when (ver > 20) $ fail $ "Unsupported version: " ++ show ver
gpf <- G.getWord16le
when (gpf .&. complement 6 /= 0) $ fail $ "Unsupported flags: " ++ show gpf
comp <- G.getWord16le
dcomp <- case comp of
0 -> return $ C.awaitForever C.yield
8 -> return $ decompress (WindowBits (-15))
_ -> fail $ "Unsupported compression method: " ++ show comp
time <- G.getWord16le
date <- G.getWord16le
let mtime = UTCTime (fromGregorian
(fromIntegral $ date `shiftR` 9 + 1980)
(fromIntegral $ date `shiftR` 5 .&. 0x0f)
(fromIntegral $ date .&. 0x1f)
)
(timeOfDayToTime $ TimeOfDay
(fromIntegral $ time `shiftR` 11)
(fromIntegral $ time `shiftR` 5 .&. 0x3f)
(fromIntegral $ time `shiftL` 1 .&. 0x3f)
)
crc <- G.getWord32le
csiz <- G.getWord32le
usiz <- G.getWord32le
nlen <- G.getWord16le
elen <- G.getWord16le
name <- G.getByteString $ fromIntegral nlen
G.skip $ fromIntegral elen
return FileHeader
{ fileEntry = ZipEntry
{ zipEntryName = name
, zipEntryTime = mtime
, zipEntrySize = fromIntegral usiz
}
, fileDecompress = dcomp
, fileCSize = csiz
, fileCRC = crc
}
centralHeader = do
-- ignore everything
G.skip 24
nlen <- G.getWord16le
elen <- G.getWord16le
clen <- G.getWord16le
G.skip $ 12 + fromIntegral nlen + fromIntegral elen + fromIntegral clen
endDirectory = do
G.skip 16
clen <- G.getWord16le
comm <- G.getByteString $ fromIntegral clen
return ZipInfo
{ zipComment = comm
}
pass 0 = return ()
pass n = C.await >>= maybe
(fail $ "EOF in file data, expecting " ++ show n ++ " more bytes")
(\b -> do
let (b', r) = BS.splitAt n b
C.yield b'
if BS.null r
then pass $ n - BS.length b'
else C.leftover r)

30
LICENSE Normal file
View File

@ -0,0 +1,30 @@
Copyright Dylan Simon (c) 2017
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following
disclaimer in the documentation and/or other materials provided
with the distribution.
* Neither the name of Dylan Simon nor the names of other
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

2
Setup.hs Normal file
View File

@ -0,0 +1,2 @@
import Distribution.Simple
main = defaultMain

4
stack.yaml Normal file
View File

@ -0,0 +1,4 @@
resolver: lts-8.13
packages:
- '.'
extra-deps: []

32
zip-stream.cabal Normal file
View File

@ -0,0 +1,32 @@
name: zip-stream
version: 0.1.0.0
-- synopsis:
-- description:
homepage: https://github.com/dylex/zip-stream#readme
license: BSD3
license-file: LICENSE
author: Dylan Simon
maintainer: dylan@dylex.net
copyright: 2017
category: Codec
build-type: Simple
cabal-version: >=1.10
source-repository head
type: git
location: https://github.com/dylex/zip-stream
library
exposed-modules:
Codec.Archive.Zip.Conduit.UnZip
default-language: Haskell2010
ghc-options: -Wall
build-depends:
base >= 4.7 && < 5,
binary,
binary-conduit,
bytestring,
conduit,
conduit-extra,
digest,
time