Plan 9 from Bell Labs’s /usr/web/sources/contrib/fernan/nhc98/src/libraries/HaXml/bugs/panitz/XmlLex.hs

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


module XmlLex
  ( xmlLex         -- :: String -> String -> [Token]
  , xmlReLex       -- :: Posn   -> String -> [Token]
  , posInNewCxt    -- :: String -> Posn
  , Posn(..)
  , TokenT(..)
  , Token
  , Special(..)
  , Section(..)
  ) where

-- This is a hand-written lexer for tokenising the text of an XML
-- document so that it is ready for parsing.  It attaches position
-- information in (line,column) format to every token.  The main
-- entry point is xmlLex.  A secondary entry point, xmlReLex, is
-- provided for when the parser needs to stuff a string back onto
-- the front of the text and re-tokenise it (typically when expanding
-- macros).
--
-- As one would expect, the lexer is essentially a small finite
-- state machine.



import Prelude
import Char

import XmlChar



data Where = InTag | NotInTag
    deriving (Eq)

type Token = (Posn, TokenT)

data Posn = Pn String Int Int (Maybe Posn) -- filename, line, column, incl.point
        deriving (Eq)

instance Show Posn where
      showsPrec p (Pn f l c i) = showString f .
                                 showString "  at line " . shows l .
                                 showString " col " . shows c .
                                 ( case i of
                                    Nothing -> id
                                    Just p  -> showString "\n    used by  " .
                                               shows p )

data TokenT =
      TokCommentOpen		--     <!--
    | TokCommentClose		--     -->
    | TokPIOpen			--     <?
    | TokPIClose		--     ?>
    | TokSectionOpen		--     <![
    | TokSectionClose		--     ]]>
    | TokSection Section	--     CDATA INCLUDE IGNORE etc
    | TokSpecialOpen		--     <!
    | TokSpecial Special	--     DOCTYPE ELEMENT ATTLIST etc
    | TokEndOpen		--     </
    | TokEndClose		--     />
    | TokAnyOpen		--     <
    | TokAnyClose		--     >
    | TokSqOpen			--     [
    | TokSqClose		--     ]
    | TokEqual			--     =
    | TokQuery			--     ?
    | TokStar			--     *
    | TokPlus			--     +
    | TokAmp			--     &
    | TokSemi			--     ;
    | TokHash			--     #
    | TokBraOpen		--     (
    | TokBraClose		--     )
    | TokPipe			--     |
    | TokPercent		--     %
    | TokComma			--     ,
    | TokQuote			--     '' or ""
    | TokName      String	--     begins with letter
    | TokFreeText  String	--     any character data
    | TokNull			--     fake token
    deriving (Eq)

data Special =
      DOCTYPEx
    | ELEMENTx
    | ATTLISTx
    | ENTITYx
    | NOTATIONx
    deriving (Eq,Show)
data Section =
      CDATAx
    | INCLUDEx
    | IGNOREx
    deriving (Eq,Show)

instance Show TokenT where
  showsPrec p  TokCommentOpen		= showString     "<!--"
  showsPrec p  TokCommentClose		= showString     "-->"
  showsPrec p  TokPIOpen		= showString     "<?"
  showsPrec p  TokPIClose		= showString     "?>"
  showsPrec p  TokSectionOpen		= showString     "<!["
  showsPrec p  TokSectionClose		= showString     "]]>"
  showsPrec p  (TokSection s)		= showsPrec p s
  showsPrec p  TokSpecialOpen		= showString     "<!"
  showsPrec p  (TokSpecial s)		= showsPrec p s
  showsPrec p  TokEndOpen		= showString     "</"
  showsPrec p  TokEndClose		= showString     "/>"
  showsPrec p  TokAnyOpen		= showString     "<"
  showsPrec p  TokAnyClose		= showString     ">"
  showsPrec p  TokSqOpen		= showString     "["
  showsPrec p  TokSqClose		= showString     "]"
  showsPrec p  TokEqual			= showString     "="
  showsPrec p  TokQuery			= showString     "?"
  showsPrec p  TokStar			= showString     "*"
  showsPrec p  TokPlus			= showString     "+"
  showsPrec p  TokAmp			= showString     "&"
  showsPrec p  TokSemi			= showString     ";"
  showsPrec p  TokHash			= showString     "#"
  showsPrec p  TokBraOpen		= showString     "("
  showsPrec p  TokBraClose		= showString     ")"
  showsPrec p  TokPipe			= showString     "|"
  showsPrec p  TokPercent		= showString     "%"
  showsPrec p  TokComma			= showString     ","
  showsPrec p  TokQuote			= showString     "' or \""
  showsPrec p  (TokName      s)		= showString     s
  showsPrec p  (TokFreeText  s)		= showString     s
  showsPrec p  TokNull			= showString     "(null)"

--trim, revtrim :: String -> String
--trim    = f . f         where f = reverse . dropWhile isSpace
--revtrim = f.reverse.f   where f = dropWhile isSpace
revtrim = reverse . dropWhile (=='\n')

emit :: TokenT -> Posn -> Token
emit tok p = forcep p `seq` (p,tok)

forcep (Pn f n m i) = m `seq` n

lexerror :: String -> Posn -> a
lexerror s p = error ("Lexical error in  "++show p++": "++s++"\n")

addcol :: Int -> Posn -> Posn
addcol n (Pn f r c i) = Pn f r (c+n) i

newline, tab :: Posn -> Posn
newline (Pn f r c i) = Pn f (r+1) 1 i
tab     (Pn f r c i) = Pn f r (((c`div`8)+1)*8) i

white :: Char -> Posn -> Posn
white ' '  = addcol 1
white '\n' = newline
white '\r' = id
white '\t' = tab
white '\xa0' = addcol 1

skip :: Int -> Posn -> String -> (Posn->String->[Token]) -> [Token]
skip n p s k = k (addcol n p) (drop n s)

blank :: ([Where]->Posn->String->[Token]) -> [Where]-> Posn-> String-> [Token]
blank k    (InTag:_) p [] = lexerror "unexpected EOF in tag" p
blank k          _   p [] = []
blank k      w p (' ': s) = blank k w (addcol 1 p) s
blank k      w p ('\t':s) = blank k w (tab p) s
blank k      w p ('\n':s) = blank k w (newline p) s
blank k      w p ('\r':s) = blank k w  p s
blank k   w p ('\xa0': s) = blank k w (addcol 1 p) s
blank k      w p    s     = k w p s

prefixes :: String -> String -> Bool
[]     `prefixes`   ys   = True
(x:xs) `prefixes` (y:ys) = x==y && xs `prefixes` ys
(x:xs) `prefixes`   []   = False --error "unexpected EOF in prefix"

accumulateUntil (c:cs) tok acc pos  p  [] k =
    lexerror ("unexpected EOF while looking for "++c:cs++" after "++show pos) p
accumulateUntil (c:cs) tok acc pos  p (s:ss) k
    | c==s && cs `prefixes` ss  = emit (TokFreeText (reverse acc)) pos:
                                  emit tok p: skip (length cs) p ss k
    | isSpace s  = accumulateUntil (c:cs) tok (s:acc) pos (white s p) ss k
    | otherwise  = accumulateUntil (c:cs) tok (s:acc) pos (addcol 1 p) ss k

----
posInNewCxt :: String -> Maybe Posn -> Posn
posInNewCxt name pos = Pn name 1 1 pos

xmlLex :: String -> String -> [Token]
xmlLex filename = xmlAny [] (posInNewCxt ("file "++filename) Nothing)

xmlReLex :: Posn -> String -> [Token]
xmlReLex p s
      | "INCLUDE"  `prefixes` s  = emit (TokSection INCLUDEx) p: k 7
      | "IGNORE"   `prefixes` s  = emit (TokSection IGNOREx) p:  k 6
      | otherwise = blank xmlAny [] p s
  where
    k n = skip n p s (blank xmlAny [])

--xmltop :: Posn -> String -> [Token]
--xmltop p [] = []
--xmltop p s
--    | "<?"   `prefixes` s  = emit TokPIOpen p:      next 2 (xmlPI [InTag])
--    | "<!--" `prefixes` s  = emit TokCommentOpen p: next 4 (xmlComment [])
--    | "<!"   `prefixes` s  = emit TokSpecialOpen p: next 2 (xmlSpecial [InTag])
--    | otherwise            = lexerror "expected <?xml?> or <!DOCTYPE>" p
--  where next n k = skip n p s k

xmlPI      w p s = xmlName p s (blank xmlPIEnd w)
xmlPIEnd   w p s = accumulateUntil "?>"  TokPIClose "" p p s
                                                      (blank xmlAny (tail w))
xmlComment w p s = accumulateUntil "-->" TokCommentClose "" p p s
                                                             (blank xmlAny w)

-- Note: the order of the clauses in xmlAny is very important.
-- Some matches must precede the NotInTag test, the rest must follow it.
xmlAny :: [Where] -> Posn -> String -> [Token]
xmlAny    (InTag:_)  p [] = lexerror "unexpected EOF inside tag" p
xmlAny          _    p [] = []
xmlAny w p s@('<':ss)
    | "?"   `prefixes` ss = emit TokPIOpen p:      skip 2 p s (xmlPI (InTag:w))
    | "!--" `prefixes` ss = emit TokCommentOpen p: skip 4 p s (xmlComment w)
    | "!["  `prefixes` ss = emit TokSectionOpen p: skip 3 p s (xmlSection w)
    | "!"   `prefixes` ss = emit TokSpecialOpen p:
                                              skip 2 p s (xmlSpecial (InTag:w))
    | "/"   `prefixes` ss = emit TokEndOpen p:
                                             skip 2 p s (xmlTag (InTag:tail w))
    | otherwise           = emit TokAnyOpen p:
                                         skip 1 p s (xmlTag (InTag:NotInTag:w))
xmlAny (_:_:w) p s@('/':ss)
    | ">"   `prefixes` ss = emit TokEndClose p: skip 2 p s (xmlAny w)
xmlAny w p ('&':ss) = emit TokAmp p:      accumulateUntil ";" TokSemi "" p
                                                     (addcol 1 p) ss (xmlAny w)
xmlAny w@(NotInTag:_) p s = xmlContent "" w p p s
xmlAny w p ('>':ss) = emit TokAnyClose p:       xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('[':ss) = emit TokSqOpen p:   blank xmlAny (InTag:w) (addcol 1 p) ss
xmlAny w p (']':ss)
    | "]>" `prefixes` ss  =
                 emit TokSectionClose p:  skip 3 p (']':ss) (xmlAny (tail w))
    | otherwise  =    emit TokSqClose p:  blank xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('(':ss) = emit TokBraOpen p:  blank xmlAny (InTag:w) (addcol 1 p) ss
xmlAny w p (')':ss) = emit TokBraClose p: blank xmlAny (tail w) (addcol 1 p) ss
xmlAny w p ('=':ss) = emit TokEqual p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('*':ss) = emit TokStar p:     blank xmlAny w (addcol 1 p) ss
xmlAny w p ('+':ss) = emit TokPlus p:     blank xmlAny w (addcol 1 p) ss
xmlAny w p ('?':ss) = emit TokQuery p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('|':ss) = emit TokPipe p:     blank xmlAny w (addcol 1 p) ss
xmlAny w p ('%':ss) = emit TokPercent p:  blank xmlAny w (addcol 1 p) ss
xmlAny w p (';':ss) = emit TokSemi p:     blank xmlAny w (addcol 1 p) ss
xmlAny w p (',':ss) = emit TokComma p:    blank xmlAny w (addcol 1 p) ss
xmlAny w p ('#':ss) = emit TokHash p:     blank xmlAny w (addcol 1 p) ss
xmlAny w p ('"':ss) = emit TokQuote p:    accumulateUntil "\"" TokQuote "" p
                                                     (addcol 1 p) ss (xmlAny w)
xmlAny w p ('\'':ss) = emit TokQuote p:   accumulateUntil "'" TokQuote "" p
                                                     (addcol 1 p) ss (xmlAny w)
xmlAny w p s
    | isSpace (head s)     = blank xmlAny w p s
    | isNmstart (head s)   = xmlName p s (blank xmlAny w)
    | otherwise            = lexerror "unrecognised token" p

xmlTag w p s = xmlName p s (blank xmlAny w)

xmlSection = blank xmlSection0
  where
    xmlSection0 w p s
      | "CDATA["   `prefixes` s  = emit (TokSection CDATAx) p:  accum w p s 6
      | "INCLUDE"  `prefixes` s  = emit (TokSection INCLUDEx) p:    k w p s 7
      | "IGNORE"   `prefixes` s  = emit (TokSection IGNOREx) p:     k w p s 6
      | "%"        `prefixes` s  = emit TokPercent p:               k w p s 1
      | otherwise = lexerror ("expected CDATA, IGNORE, or INCLUDE") p
    accum w p s n =
      let p0 = addcol n p in
      accumulateUntil "]]>" TokSectionClose "" p0 p0 (drop n s) (blank xmlAny w)
    k w p s n =
      skip n p s (xmlAny w)

xmlSpecial w p s
    | "DOCTYPE"  `prefixes` s = emit (TokSpecial DOCTYPEx) p: k 7
    | "ELEMENT"  `prefixes` s = emit (TokSpecial ELEMENTx) p: k 7
    | "ATTLIST"  `prefixes` s = emit (TokSpecial ATTLISTx) p: k 7
    | "ENTITY"   `prefixes` s = emit (TokSpecial ENTITYx) p:  k 6
    | otherwise = lexerror "expected DOCTYPE, ELEMENT, ENTITY, or ATTLIST" p
  where k n = skip n p s (blank xmlAny w)

xmlName p (s:ss) k
    | isNmstart s  = gatherName (s:[]) p (addcol 1 p) ss k
    -- | isAlphaNum s || s==':' || s=='_'  = gatherName (s:[]) p (addcol 1 p) ss k
    | otherwise                         = lexerror ((show$ord s) ++"    expected name") p
  where
    gatherName acc pos p [] k =
        emit (TokName (reverse acc)) pos: k p []
    --  lexerror ("unexpected EOF in name at "++show pos) p
    gatherName acc pos p (s:ss) k
--        | isAlphaNum s || s `elem` ".-_:"
        | isNmchar s|| s `elem` ".-_:"
                      = gatherName (s:acc) pos (addcol 1 p) ss k
        | otherwise   = emit (TokName (reverse acc)) pos: k p (s:ss)

xmlContent acc w pos p [] = if all isSpace acc then []
                            else lexerror "unexpected EOF between tags" p
xmlContent acc w pos p (s:ss)
    | elem s "<&"    = if all isSpace acc then xmlAny w p (s:ss)
                       else emit (TokFreeText (revtrim acc)) pos: xmlAny w p (s:ss)
    | isSpace s      = xmlContent (s:acc) w pos (white s p) ss
    | otherwise      = xmlContent (s:acc) w pos (addcol 1 p) ss



--ident :: (String->TokenT) ->
--          Posn -> String -> [String] ->
--         (Posn->String->[String]->[Token]) -> [Token]
--ident tok p s ss k =
--    let (name,s0) = span (\c-> isAlphaNum c || c `elem` "`-_#.'/\\") s
--    in emit (tok name) p: skip (length name) p s ss k


Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to [email protected].