Haskell: Forming an AST with Parsec

July 31, 2020

After a few iterations, we now have a parser that takes a Java source file and parses it in a way that distinguishes between comments and code. The previous parser produces a [String], where each String is either a comment or code. But all the parts have the same type, which means that without reparsing somehow – at a minimum, looking at the first few characters of each String – there is no way to tell whether a given String is code or comment.

In this version, the Java source is parsed to an AST, although the "AST" is not very Abstract and is certainly not a Tree – about all it does involve is Syntax. The program starts off just as in the previous two cases (with one small change in red):

{-# LANGUAGE LambdaCase #-}

import System.Environment
import System.Directory
import Text.ParserCombinators.Parsec
import Control.Monad

main = do
    args <- getArgs
    
    (argsValid args) >>= \case
        False -> putStrLn "Give me a single valid file name."
        True -> do
            contents <- readFile $ head args
            let result = parseJava contents
            case result of 
                Left err -> putStrLn (show(err))
                Right valid -> mapM_ putStr $ map show valid

argsValid :: [String] -> IO Bool
argsValid names = do
    if (null names) || (length names /= 1)
      then return False
      else doesFileExist $ head $ names

The parser is also nearly identical, but the top level of the parser no longer returns [String]; instead it returns [ParsedJava] (or here is the complete version):

data ParsedJava = SLComment String | MLComment String | 
                    JavaCode String | WhiteSpace String

instance Show ParsedJava where
  show (SLComment s) = s
  show (MLComment s) = s
  show (JavaCode s) = s
  show (WhiteSpace s) = s

parseJava :: String -> Either ParseError [ParsedJava]
parseJava input = parse parseJavaInput "" input

parseJavaInput :: Parser [ParsedJava]
parseJavaInput = manyTill javaPart eof

javaPart :: Parser ParsedJava
javaPart = parseSLComment <|> parseMLComment <|> javaCode

parseSLComment :: Parser ParsedJava
parseSLComment = do
  try $ string "//"
  guts <- manyTill anyChar (try $ string "\n")
  return $ SLComment ("//" ++ guts ++ "\n")

parseMLComment :: Parser ParsedJava
parseMLComment = do
  try $ string "/*"
  guts <- manyTill anyChar (try $ string "*/")
  return $ MLComment ("/*" ++ guts ++ "*/")

javaCode :: Parser ParsedJava
javaCode = do
  x <- many1 javaBite
  return $ JavaCode $ concat x

javaBite :: Parser String
javaBite = stringCode <|> quoteCode <|> nonStringCode

stringCode :: Parser String
stringCode = do
  try $ char '"'
  x <- manyTill stringChar (string "\"")
  return ("\"" ++ concat x ++ "\"")

stringChar :: Parser String
stringChar = stringNonEscape <|> stringEscape

stringNonEscape :: Parser String
stringNonEscape = do
  x <- noneOf "\\\""
  return [x]

stringEscape :: Parser String
stringEscape = do
  d <- char '\\'
  c <- oneOf "\\\"0nrvtbf'u"
  return [d,c]

quoteCode :: Parser String
quoteCode = do
  try $ char '\''
  x <- manyTill quoteChar (string "'")
  return ("'" ++ concat x ++ "'")

quoteChar :: Parser String
quoteChar = quoteNonEscape <|> quoteEscape

quoteNonEscape :: Parser String
quoteNonEscape = do
  x <- noneOf "\\'"
  return [x]

quoteEscape :: Parser String
quoteEscape = do
  d <- char '\\'
  c <- oneOf "\\\"0nrvtbf'u"
  return [d,c]

nonStringCode :: Parser String
nonStringCode = (many1 $ noneOf "/'\"") <|> try goodSlash

goodSlash :: Parser String
goodSlash = do
  x <- char '/'
  y <- noneOf "/*"
  return [x,y]

Prev

Contact

Next