Perfect SAST. Testing parsers

While github is indexed (thanks to the limit of 5000 requests per hour), which is needed to get data for the article, let's talk about testing lexers and parsers for now. We will discuss wishes for the process of developing grammars, testing them and quality control so as not to turn into a creature in the picture.





Typical parser developer
Typical parser developer

, , - ++. , . , TDD, , 15 100% ( ). , .





,

++, , , , ++ , , . , , , .





, , :





class Example {
    boolean flag;
}
      
      



, , ? , , , , , , . .





, ( , ), , , , , . TDD, , .





, - , , , , - .





, , , ? , .





: C# , , ? ( ) :





  1. ID ? ID : ID





  2. ID ? LITERAL : LITERAL





  3. ID? ? ANY : ANY





! C# , value is MyType ? var1 : var2, , ? . Antlr . , . , , , , . , 1 100 1000 0.001 ?





, , . , , . ++ , - . , .





- , , json, , .





{
  "tokens" : [ {
    "tokenType" : "PACKAGE",
    "statement" : "1:0[7]"
  }, {
    "tokenType" : "Identifier",
    "statement" : "1:8[7]",
    "text" : "example"
  }, {
    "tokenType" : "SEMI",
    "statement" : "1:15[1]"
  }, {
    "tokenType" : "CLASS",
    "statement" : "3:0[5]"
  }, {
    "tokenType" : "Identifier",
    "statement" : "3:6[7]",
    "text" : "Example"
  }, {
    "tokenType" : "FoldBlock",
    "statement" : "3:14[21]",
    "children" : {
      "tokens" : [ {
        "tokenType" : "LBRACE",
        "statement" : "3:14[1]"
      }, {
        "tokenType" : "BOOLEAN",
        "statement" : "4:4[7]"
      }, {
        "tokenType" : "Identifier",
        "statement" : "4:12[4]",
        "text" : "flag"
      }, {
        "tokenType" : "SEMI",
        "statement" : "4:16[1]"
      }, {
        "tokenType" : "RBRACE",
        "statement" : "5:0[1]"
      }, {
        "tokenType" : "EOF",
        "statement" : "5:1[3]"
      } ]
    }
  }, {
    "tokenType" : "EOF",
    "statement" : "6:0[5]"
  } ]
}
      
      



, , - .





, , , - , - , . ParserService, , , , ( ).





, -:





  def testLexer(suite: Suite): Unit = {
    implicit val ps: ParserService = ParserService(suite.language)
    for (fi <- suite.fileInfos) {
      assertFile(fi.file)
      val lexerOutput: LexerOutput = readFromFile[LexerOutput](testConfig(LexerSuiteDir, suite, fi.file))
      val lexer = ps.newLexer(asVirtualFile(fi))
      val stream = new CommonTokenStream(lexer)
      stream.fill()
      val tokens = CollectionConverters.asScala(stream.getTokens)
      assertTokens(lexer, lexerOutput, tokens)
    }
  }
  
  private def assertTokens(lexer: AbstractRScanLexer, lexerOutput: LexerOutput, tokens: mutable.Buffer[Token]): Unit = {
    val queue = new mutable.Queue[LexerOutputToken]()
    queue ++= lexerOutput.tokens
    for (t <- tokens) {
      assertTrue(queue.nonEmpty, "Queue should not be empty")
      val lot = queue.dequeue()
      val tokenType = lexer.getVocabulary.getSymbolicName(t.getType)
      assertEquals(lot.tokenType, tokenType, "Token types are not equal")
      assertEquals(lot.statement, tokenStatement(t), "Token types are not equal")
      if (lot.text != null) {
        assertEquals(lot.text, t.getText, "Texts are not equal")
      }
      if (lot.children != null) {
        assertTrue(t.isInstanceOf[RScanToken], "Must be RScanToken")
        val rt = t.asInstanceOf[RScanToken]
        assertTrue(rt.foldedTokens().nonEmpty, "Must have fold tokens")
        assertTokens(lexer, lot.children, rt.foldedTokens().toBuffer)
      } else {
        assertFalse(t.isInstanceOf[RScanToken] && t.asInstanceOf[RScanToken].foldedTokens().nonEmpty, "No fold tokens allowed")
      }
    }
    if (queue.nonEmpty && queue.head.tokenType.equals("EOF")) queue.dequeue()
    assertTrue(queue.isEmpty, "Queue must be empty")
  }
      
      



- , , , .





, :





class JavaParserTest extends AbstractJUnitAntlr4Test {

  @ParameterizedTest(name = "[{index}] {0}")
  @MethodSource(Array("testFiles"))
  override def testLexer(suite: Suite): Unit = {
    super.testLexer(suite)
  }

  @ParameterizedTest(name = "[{index}] {0}")
  @MethodSource(Array("testFiles"))
  override def testParser(suite: Suite): Unit = {
    super.testParser(suite)
  }
}


object JavaParserTest {
  private val TestDataDirectory = "/test/java/unit"

  def testFiles: java.util.stream.Stream[Arguments] = Antlr4TestUtils.filesFromResourceDirectory(JavaLanguage.ref, TestDataDirectory, getClass, Seq("java"))
}
      
      



/test/java/unit (suite), junit testLexer, - , . !





java -Xmx128m -Dfile.encoding=UTF-8 -classpath $CLASSPATH \
  org.lastrix.rscan.test.ParserTestSuiteGenerator \
  -l Java -d $TEST_PATH -e .java
      
      



./gradlew clean test
      
      



- . , AST. AST, , , , :





{
  "languageName" : "Java",
  "items" : [ {
    "keyText" : "RAW_DECL_PACKAGE at field000.java[1:0-5:1]",
    "children" : [ {
      "keyText" : "NAME at field000.java[1:8-1:15]",
      "children" : [ {
        "keyText" : "UNRESOLVED_ID at field000.java[1:8-1:15]",
        "specText" : "example"
      } ]
    }, {
      "keyText" : "RAW_DECL_CLASS at field000.java[3:0-5:1]",
      "children" : [ {
        "keyText" : "NAME at field000.java[3:6-3:13]",
        "children" : [ {
          "keyText" : "UNRESOLVED_ID at field000.java[3:6-3:13]",
          "specText" : "Example"
        } ]
      }, {
        "keyText" : "MEMBERS at field000.java[3:14-5:1]",
        "children" : [ {
          "keyText" : "RAW_DECL_FIELD at field000.java[4:4-4:16]",
          "children" : [ {
            "keyText" : "RAW_TYPE at field000.java[4:4-4:11]",
            "children" : [ {
              "keyText" : "UNRESOLVED_ID at field000.java[4:4-4:11]",
              "specText" : "boolean"
            } ]
          }, {
            "keyText" : "RAW_DECL_VARIABLE at field000.java[4:12-4:16]",
            "children" : [ {
              "keyText" : "NAME at field000.java[4:12-4:16]",
              "children" : [ {
                "keyText" : "UNRESOLVED_ID at field000.java[4:12-4:16]",
                "specText" : "flag"
              } ]
            } ]
          } ]
        } ]
      } ]
    } ]
  } ]
}
      
      



, .





, -, , , :





  def testParser(suite: Suite): Unit = {
    implicit val ps: ParserService = ParserService(suite.language)
    for (fi <- suite.fileInfos) {
      assertFile(fi.file)
      val parserOutput: ParserOutput = readFromFile[ParserOutput](testConfig(ParserSuiteDir, suite, fi.file))
      val op = antlr4.parseFile(asVirtualFile(fi))
      assertTrue(op.isInstanceOf[RLangOp], "Must be RLangOp")
      assertEquals(parserOutput.languageName, op.asInstanceOf[RLangOp].language.name, "Languages are not equal")
      assertOperations(op.key, op.children, parserOutput.items)
    }
  }

  private def assertOperations(ownerKey: ROpKey, children: Seq[ROp], items: Seq[ParserOutputItem]): Unit = {
    if (children.isEmpty) {
      assertTrue(items == null || items.isEmpty, s"No items in operation: $ownerKey")
      return
    } else if (items == null) {
      Assertions.fail(s"Missing child operations: $ownerKey")
    }
    val queue = new mutable.Queue[ParserOutputItem]()
    queue ++= items
    for (child <- children) {
      assertTrue(queue.nonEmpty, s"Must not be empty at: ${child.key}")
      val poi = queue.dequeue()
      assertEquals(poi.keyText, child.key.toString, "Key text is not equal")
      assertEquals(poi.specText, child.specText, "SpecText is not equal")
      assertOperations(child.key, child.children, poi.children)
    }
    assertTrue(queue.isEmpty, s"Queue must be empty for: $ownerKey")
  }
      
      



, CI/CD , ( ). , , ( ), - - , . . , 100 . .





, , , "". , ?





org.lastrix.rscan.test.ParserTestSuiteGenerator, . (), , , , , ( , ?), , , Intellij IDEA - , 4 . -, , . , ?





Java, . (478 ), (intel i7 9700k) 1s 45ms, 65% .





The article proposes an approach to testing lexers and parsers, which allows you to speed up development and abandon the old-fashioned approach, when you needed to analyze thousands of huge projects to check the work of grammar in order to catch a small bug. Writing by hand a huge amount of code to check the result of work - all this can be automated, except perhaps to check the result of generation. While you cannot completely abandon the large code testing option, you can reduce the need to use this approach. A detected bug in large code can be easily reproduced in small code and in the future, this problem can be controlled.








All Articles