Simple programming task: count lines

Solution overview

Let's try this:

*Main> countLines "/etc/rc.d"
4979
*Main> countLines "/usr/include"
*** Exception: /usr/include/dovecot/master-service-settings.h: 
openBinaryFile: resource exhausted (Too many open files)

Lazy I/O review

Fixing readFiles

How to implement recDir?

lazy recDir -- first attempt

recDir :: FilePath -> IO [FilePath]
recDir dir = do
ds <- openDirStream dir

let nextName = unsafeInterleaveIO $ readDirStream ds >>= checkName

checkName "" = closeDirStream ds >> return []
checkName "." = nextName
checkName ".." = nextName
checkName name = getSymbolicLinkStatus path >>= checkStat path
where path = dir </> name

checkStat path stat
| isRegularFile stat = liftM (path :) nextName
| isDirectory stat = liftM2 (++) (recDir path) nextName
| otherwise = nextName

nextName

testing recDir

*Main> countLines "/usr/include"
3774172
*Main> x <- recDir "/usr/include"
*Main> :!lsof -c ghc
...
ghc   9412   dm   7r  DIR  254,0 45056    15 /usr/include
*Main> length x
19568
*Main> :!lsof -c ghc
[gone]

so far so good, but...

*Main> x <- recDir "/etc"
*Main> length x
*** Exception: /etc/sudoers.d: openDirStream:
permission denied (Permission denied)
*Main> :!lsof -c ghc
...
ghc   9817   dm   7r  DIR  254,0 12288 146200 /etc
*Main> 

Oops... length threw an exception and now we've leaked a file descriptor!

lazy recDir -- second attempt

recDir2 :: FilePath -> IO [FilePath]
recDir2 dir = do
ds <- openDirStream dir
let protect m = m `onException` closeDirStream ds

nextName = unsafeInterleaveIO $
protect (readDirStream ds) >>= checkName

checkName "" = closeDirStream ds >> return []
checkName "." = nextName
checkName ".." = nextName
checkName name = getSymbolicLinkStatus path >>= checkStat path
where path = dir </> name

checkStat path stat
| isRegularFile stat = liftM (path :) nextName
| isDirectory stat =
liftM2 (++) (protect $ recDir2 path) nextName
| otherwise = nextName

nextName

Testing recDir2

*Main> x <- recDir2 "/etc"
*Main> length x
*** Exception: /etc/sudoers.d: openDirStream:
permission denied (Permission denied)
*Main> :!lsof -c ghc
[no leaked fd]

We've fixed one file descriptor leak, but exceptions at other times can still leak descriptors...

*Main> :!mkdir -p /tmp/perm/perm/perm; chmod 0 /tmp/perm/perm/perm
*Main> recDir2 "/tmp/perm"
*Main> :!lsof -c ghc
...
ghc  7337  dm    8r   DIR   0,17    60 82955 /tmp/perm
*Main> countLines2 "/etc"
*** Exception: /etc/avenger/dh1024.pem: openBinaryFile:
permission denied (Permission denied)
*Main> :!lsof -c ghc
...
ghc  8102  dm    7r   DIR  253,5 12288 393217 /etc
ghc  8102  dm    8r   DIR  253,5  4096 393227 /etc/avenger

Pitfalls of lazy I/O

Why does Haskell even have lazy I/O?

The iteratee abstraction [Kiselyov]

Representing iteratees

Example: Reading a line of input

readLine :: Iter (Maybe L.ByteString)
readLine = Iter (go L.empty)
where go acc (Chunk input eof)
| not (L.null b) = Done (Just acca) (Chunk btail eof)
| not eof = NeedInput (Iter (go acca))
| otherwise = Done Nothing (Chunk acca eof)
where (a, b) = L8.break (== '\n') input
acca = L.append acc a
btail = L.tail b

Enumerators

Running iteratees

Calling iteratees from other iteratees

Make Iter into a Monad!

instance Monad Iter where
return a = Iter $ Done a
m >>= k = Iter $ \c -> check (runIter m c)
where check (Done a c) = runIter (k a) c
check (NeedInput m') = NeedInput (m' >>= k)
check (NeedIO io) = NeedIO (liftM check io)
check (Failed e) = Failed e
fail msg = iterThrow (ErrorCall msg)

iterThrow :: (Exception e) => e -> Iter a
iterThrow e = Iter $ \_ -> Failed (toException e)

Counting lines revisited

The MonadIO class

More simple iteratees

-- Return chunk that is non-empty of has EOF set
iterChunk :: Iter Chunk
iterChunk = Iter $ \c@(Chunk buf eof) ->
if L.null buf && not eof
then NeedInput iterChunk
else Done c (Chunk L.empty eof)

-- Dump input to standard output
iterStdout :: Iter ()
iterStdout = do
(Chunk buf eof) <- iterChunk
liftIO $ L.putStr buf
unless eof iterStdout
*Main> enumerateFile "/etc/issue" iterStdout >>= getResult0
Arch Linux \r  (\n) (\l)

*Main>

Inner pipeline stages

Inum examples

Building pipelines

Exception handling

iterCatch :: Iter a -> (SomeException -> Iter a) -> Iter a
iterCatch (Iter f0) handler = Iter (check . f0)
where check (NeedInput (Iter f)) = NeedInput (Iter (check . f))
check (NeedIO io) = NeedIO (liftM check io)
check (Failed e) = NeedInput (handler e)
check done = done

onFailed :: Iter a -> Iter b -> Iter a
onFailed iter cleanup = iter `iterCatch` \e -> cleanup >> iterThrow e

iterBracket :: Iter a -> (a -> Iter b) -> (a -> Iter c) -> Iter c
iterBracket before after action = do
a <- before
b <- action a `onFailed` after a
after a
return b

inumBracket :: Iter a -> (a -> Iter b) -> (a -> Inum c) -> Inum c
inumBracket before after inum iter =
iterBracket before after (flip inum iter)

Simplifying Inum construction

Example: enumDir

enumDir :: FilePath -> Inum a
enumDir dir = inumBracket (liftIO $ openDirStream dir)
(liftIO . closeDirStream) $ \ds ->
let inum = runCodec nextName
nextName = liftIO (readDirStream ds) >>= checkName

checkName "" = return (L.empty, Nothing)
checkName "." = nextName
checkName ".." = nextName
checkName name = liftIO (getSymbolicLinkStatus path)
>>= checkStat path
where path = dir </> name

checkStat path stat
| isRegularFile stat =
return (L8.pack $ path ++ "\n", Just inum)
| isDirectory stat =
return (L.empty, Just $ enumDir path `cat` inum)
| otherwise = nextName
in inum
*Main> run $ enumDir "/etc/rc.d" .| xargsCat .| nlines1
4588

The MonadPlus class

Making Iter a MonadPlus

instance MonadPlus Iter where
mzero = fail "mzero"
mplus itera0 iterb = go mempty itera0
where go acc itera = Iter $ \c ->
let acc' = mappend acc c
check (NeedInput i) = NeedInput (go acc' i)
check (NeedIO io) = NeedIO (liftM check io)
check (Failed _) = runIter iterb acc'
check r = r
in check $ runIter itera c