Simple programming task: count lines

Solution overview

Let’s try this:

*Main> countLines "/etc/rc.d"
4979
*Main> countLines "/usr/include"
*** Exception: /usr/include/dovecot/master-service-settings.h: 
openBinaryFile: resource exhausted (Too many open files)

Lazy I/O introduction

Fixing readFiles

How to implement recDir?

lazy recDir – first attempt

recDir :: FilePath -> IO [FilePath]
recDir dir = do
  ds <- openDirStream dir

  let nextName = unsafeInterleaveIO $ readDirStream ds >>= checkName

      checkName "" = closeDirStream ds >> return []
      checkName "." = nextName
      checkName ".." = nextName
      checkName name = getSymbolicLinkStatus path >>= checkStat path
          where path = dir </> name

      checkStat path stat
          | isRegularFile stat = liftM (path :) nextName
          | isDirectory stat   = liftM2 (++) (recDir path) nextName
          | otherwise          = nextName

  nextName

testing recDir

*Main> countLines "/usr/include"
3774172
*Main> x <- recDir "/usr/include"
*Main> :!lsof -c ghc
...
ghc   9412   dm   7r  DIR  254,0 45056    15 /usr/include
*Main> length x
19568
*Main> :!lsof -c ghc
[gone]

so far so good, but…

*Main> x <- recDir "/etc"
*Main> length x
*** Exception: /etc/sudoers.d: openDirStream:
permission denied (Permission denied)
*Main> :!lsof -c ghc
...
ghc   9817   dm   7r  DIR  254,0 12288 146200 /etc
*Main> 

Oops… length threw an exception and now we’ve leaked a file descriptor!

lazy recDir – second attempt

recDir2 :: FilePath -> IO [FilePath]
recDir2 dir = do
  ds <- openDirStream dir
  let protect m = m `onException` closeDirStream ds

      nextName = unsafeInterleaveIO $
                 protect (readDirStream ds) >>= checkName

      checkName "" = closeDirStream ds >> return []
      checkName "." = nextName
      checkName ".." = nextName
      checkName name = getSymbolicLinkStatus path >>= checkStat path
          where path = dir </> name

      checkStat path stat
          | isRegularFile stat = liftM (path :) nextName
          | isDirectory stat =
              liftM2 (++) (protect $ recDir2 path) nextName
          | otherwise = nextName

  nextName

Testing recDir2

*Main> x <- recDir2 "/etc"
*Main> length x
*** Exception: /etc/sudoers.d: openDirStream:
permission denied (Permission denied)
*Main> :!lsof -c ghc
[no leaked fd]

We’ve fixed one file descriptor leak, but exceptions at other times can still leak descriptors…

*Main> :!mkdir -p /tmp/perm/perm/perm; chmod 0 /tmp/perm/perm/perm
*Main> recDir2 "/tmp/perm"
*Main> :!lsof -c ghc
...
ghc  7337  dm    8r   DIR   0,17    60 82955 /tmp/perm
*Main> countLines2 "/etc"
*** Exception: /etc/avenger/dh1024.pem: openBinaryFile:
permission denied (Permission denied)
*Main> :!lsof -c ghc
...
ghc  8102  dm    7r   DIR  253,5 12288 393217 /etc
ghc  8102  dm    8r   DIR  253,5  4096 393227 /etc/avenger

Pitfalls of lazy I/O

Why does Haskell even have lazy I/O?

The iteratee abstraction [Kiselyov]

Representing iteratees

Example: Reading a line of input

readLine :: Iter (Maybe L.ByteString)
readLine = Iter (go L.empty)
    where go acc (Chunk input eof)
              | not (L.null b) = Done (Just acca) (Chunk btail eof)
              | not eof        = NeedInput (Iter (go acca))
              | otherwise      = Done Nothing (Chunk acca eof)
              where (a, b) = L8.break (== '\n') input
                    acca = L.append acc a
                    btail = L.tail b

Enumerators

Running iteratees

Calling iteratees from other iteratees

Make Iter into a Monad!

instance Monad Iter where
    return a = Iter $ Done a
    m >>= k = Iter $ \c -> check (runIter m c)
        where check (Done a c)     = runIter (k a) c
              check (NeedInput m') = NeedInput (m' >>= k)
              check (NeedIO io)    = NeedIO (liftM check io)
              check (Failed e)     = Failed e
    fail msg = iterThrow (ErrorCall msg)

iterThrow :: (Exception e) => e -> Iter a
iterThrow e = Iter $ \_ -> Failed (toException e)

Counting lines revisited

The MonadIO class

More simple iteratees

-- Return chunk that is non-empty of has EOF set
iterChunk :: Iter Chunk
iterChunk = Iter $ \c@(Chunk buf eof) ->
            if L.null buf && not eof
            then NeedInput iterChunk
            else Done c (Chunk L.empty eof)

-- Dump input to standard output
iterStdout :: Iter ()
iterStdout = do
  (Chunk buf eof) <- iterChunk
  liftIO $ L.putStr buf
  unless eof iterStdout
*Main> enumerateFile "/etc/issue" iterStdout >>= getResult0
Arch Linux \r  (\n) (\l)

*Main>

Inner pipeline stages

Inum examples

Building pipelines

Exception handling

iterCatch :: Iter a -> (SomeException -> Iter a) -> Iter a
iterCatch (Iter f0) handler = Iter (check . f0)
    where check (NeedInput (Iter f)) = NeedInput (Iter (check . f))
          check (NeedIO io)          = NeedIO (liftM check io)
          check (Failed e)           = NeedInput (handler e)
          check done                 = done

onFailed :: Iter a -> Iter b -> Iter a
onFailed iter cleanup = iter `iterCatch` \e -> cleanup >> iterThrow e

iterBracket :: Iter a -> (a -> Iter b) -> (a -> Iter c) -> Iter c
iterBracket before after action = do
  a <- before
  b <- action a `onFailed` after a
  after a
  return b

inumBracket :: Iter a -> (a -> Iter b) -> (a -> Inum c) -> Inum c
inumBracket before after inum iter =
    iterBracket before after (flip inum iter)

Simplifying Inum construction

Example: enumDir

enumDir :: FilePath -> Inum a
enumDir dir = inumBracket (liftIO $ openDirStream dir)
              (liftIO . closeDirStream) $ \ds ->
  let inum = runCodec nextName
      nextName = liftIO (readDirStream ds) >>= checkName

      checkName "" = return (L.empty, Nothing)
      checkName "." = nextName
      checkName ".." = nextName
      checkName name = liftIO (getSymbolicLinkStatus path)
                       >>= checkStat path
          where path = dir </> name

      checkStat path stat
          | isRegularFile stat =
              return (L8.pack $ path ++ "\n", Just inum)
          | isDirectory stat =
              return (L.empty, Just $ enumDir path `cat` inum)
          | otherwise = nextName
  in inum
*Main> run $ enumDir "/etc/rc.d" .| xargsCat .| nlines1
4588

The MonadPlus class

Making Iter a MonadPlus

instance MonadPlus Iter where
    mzero = fail "mzero"
    mplus itera0 iterb = go mempty itera0
        where go acc itera = Iter $ \c ->
                  let acc' = mappend acc c
                      check (NeedInput i) = NeedInput (go acc' i)
                      check (NeedIO io) = NeedIO (liftM check io)
                      check (Failed _) = runIter iterb acc'
                      check r = r
                  in check $ runIter itera c