Skip to content

Commit c5adf93

Browse files
committed
Parse code fences with goldmark (take 2)
This commit is another attempt at #2937. It changes how the bridge finds code blocks by leveraging goldmark.
1 parent 3422ae0 commit c5adf93

13 files changed

+1377
-139
lines changed

Makefile

+3
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,9 @@ install_plugins::
1111
pulumi plugin install resource aws 6.22.2
1212
pulumi plugin install resource archive 0.0.4
1313
pulumi plugin install resource wavefront 3.0.0
14+
pulumi plugin install resource auth0 3.16.0
15+
pulumi plugin install resource http 0.0.11
16+
pulumi plugin install resource gcp 8.22.0
1417
pulumi plugin install resource equinix 0.6.0 --server github://api.github.com/equinix
1518

1619
build::

pkg/tfgen/docs.go

+129-120
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ import (
3737
bf "github.com/russross/blackfriday/v2"
3838
"github.com/spf13/afero"
3939
"github.com/yuin/goldmark"
40+
"github.com/yuin/goldmark/ast"
4041
gmast "github.com/yuin/goldmark/ast"
4142
gmtext "github.com/yuin/goldmark/text"
4243
"golang.org/x/text/cases"
@@ -46,6 +47,7 @@ import (
4647
"github.com/pulumi/pulumi-terraform-bridge/v3/pkg/tfbridge"
4748
"github.com/pulumi/pulumi-terraform-bridge/v3/pkg/tfbridge/info"
4849
"github.com/pulumi/pulumi-terraform-bridge/v3/pkg/tfgen/parse"
50+
"github.com/pulumi/pulumi-terraform-bridge/v3/pkg/tfgen/parse/section"
4951
)
5052

5153
const (
@@ -1511,70 +1513,70 @@ func (g *Generator) convertExamples(docs string, path examplePath) string {
15111513
// codeBlock represents a code block found in the upstream docs, delineated by code fences (```).
15121514
// It also tracks which header it is part of.
15131515
type codeBlock struct {
1514-
start int // The index of the first backtick of an opening code fence
1515-
end int // The index of the first backtick of a closing code fence
1516-
headerStart int // The index of the first "#" in a Markdown header. A value of -1 indicates there's no header.
1517-
}
1518-
1519-
func findCodeBlock(doc string, i int) (codeBlock, bool) {
1520-
codeFence := "```"
1521-
var block codeBlock
1522-
// find opening code fence
1523-
if doc[i:i+len(codeFence)] == codeFence {
1524-
block.start = i
1525-
// find closing code fence
1526-
for j := i + len(codeFence); j < (len(doc) - len(codeFence)); j++ {
1527-
if doc[j:j+len(codeFence)] == codeFence {
1528-
block.end = j
1529-
return block, true
1530-
}
1531-
}
1532-
return block, false
1533-
}
1534-
return block, false
1516+
start int // The index of the first backtick of an opening code fence
1517+
end int // The index of the first backtick of a closing code fence
1518+
headerStart int // The index of the first "#" in a Markdown header. A value of -1 indicates there's no header.
1519+
language string // The language of the code block.
15351520
}
15361521

1537-
func findHeader(doc string, i int) (int, bool) {
1538-
h2 := "##"
1539-
h3 := "###"
1540-
var foundH2, foundH3 bool
1541-
1542-
if i == 0 {
1543-
// handle header at very beginning of doc
1544-
foundH2 = doc[i:i+len(h2)] == h2
1545-
foundH3 = doc[i:i+len(h3)] == h3
1546-
} else {
1547-
// all other headers must be preceded by a newline
1548-
foundH2 = doc[i:i+len(h2)] == h2 && string(doc[i-1]) == "\n"
1549-
foundH3 = doc[i:i+len(h3)] == h3 && string(doc[i-1]) == "\n"
1550-
}
1551-
1552-
if foundH3 {
1553-
return i + len(h3), true
1554-
}
1555-
if foundH2 {
1556-
return i + len(h2), true
1557-
}
1558-
return -1, false
1522+
// A string representing the code inside a code block.
1523+
//
1524+
// Given the code block:
1525+
//
1526+
// ```sh
1527+
// $ cmd \
1528+
// --flag
1529+
//
1530+
// ```
1531+
//
1532+
// This method would return "$ cmd \\\n --flag\n".
1533+
//
1534+
// The returned string represents a view into the passed in byte slice, and does not
1535+
// remove any padding found in the original document.
1536+
func (cb codeBlock) code(document []byte) string {
1537+
nextNewLine := bytes.IndexRune(document[cb.start:cb.end], '\n')
1538+
return string(document[cb.start+nextNewLine+1 : cb.end])
15591539
}
15601540

1561-
func findFencesAndHeaders(doc string) []codeBlock {
1562-
codeFence := "```"
1541+
func findCodeBlocks(docs []byte) []codeBlock {
1542+
rootNode := goldmark.New(goldmark.WithExtensions(parse.TFRegistryExtension)).
1543+
Parser().Parse(gmtext.NewReader(docs))
1544+
15631545
var codeBlocks []codeBlock
1564-
headerStart := -1
1565-
for i := 0; i < (len(doc) - len(codeFence)); i++ {
1566-
block, found := findCodeBlock(doc, i)
1567-
if found {
1568-
block.headerStart = headerStart
1569-
codeBlocks = append(codeBlocks, block)
1570-
i = block.end + 1
1546+
parse.WalkNode(rootNode, func(cb *ast.FencedCodeBlock) {
1547+
lines := cb.Lines()
1548+
1549+
headerStart := -1
1550+
for p := cb.Parent(); p != nil; p = p.Parent() {
1551+
if s, ok := p.(*section.Section); ok {
1552+
l := s.FirstChild().Lines()
1553+
if l.Len() == 0 {
1554+
// A header has no lines when there is no text associated with it. In that
1555+
// case we can't find its location due to limitations of goldmark.
1556+
//
1557+
// Just give up on finding a header here.
1558+
break
1559+
}
1560+
headerStart = bytes.LastIndexByte(docs[:l.At(0).Start], '\n') + 1
1561+
break
1562+
}
15711563
}
1572-
headerEnd, found := findHeader(doc, i)
1573-
if found {
1574-
headerStart = i
1575-
i = headerEnd
1564+
1565+
firstNewlineOfCodeBlock := bytes.LastIndexByte(docs[:lines.At(0).Start], '\n')
1566+
firstNewlineOfCodeFence := bytes.LastIndexByte(docs[:firstNewlineOfCodeBlock], '\n')
1567+
if firstNewlineOfCodeFence == -1 {
1568+
// This means that docs starts with a code block
1569+
firstNewlineOfCodeFence = 0
15761570
}
1577-
}
1571+
firstBacktickOfCodeFence := bytes.IndexByte(docs[firstNewlineOfCodeFence:], '`') + firstNewlineOfCodeFence
1572+
1573+
codeBlocks = append(codeBlocks, codeBlock{
1574+
start: firstBacktickOfCodeFence,
1575+
end: lines.At(lines.Len() - 1).Stop,
1576+
headerStart: headerStart,
1577+
language: string(cb.Language(docs)),
1578+
})
1579+
})
15781580
return codeBlocks
15791581
}
15801582

@@ -1593,14 +1595,13 @@ func (g *Generator) convertExamplesInner(
15931595
_, err := fmt.Fprintf(output, f, args...)
15941596
contract.AssertNoErrorf(err, "Cannot fail to write out output buffer")
15951597
}
1596-
codeBlocks := findFencesAndHeaders(docs)
15971598
const codeFence = "```"
15981599

15991600
// Traverse the code blocks and take appropriate action before appending to output
16001601
textStart := 0
16011602
stripSection := false
1602-
stripSectionHeader := 0
1603-
for _, tfBlock := range codeBlocks {
1603+
stripSectionHeader := 0 // The index of the header that we might want to strip.
1604+
for _, tfBlock := range findCodeBlocks([]byte(docs)) {
16041605
// if the section has a header we append the header after trying to convert the code.
16051606
hasHeader := tfBlock.headerStart >= 0 && textStart < tfBlock.headerStart
16061607

@@ -1615,75 +1616,77 @@ func (g *Generator) convertExamplesInner(
16151616
// if we are stripping this section and still have the same header, we append nothing and skip to the next
16161617
// code block.
16171618
if stripSectionHeader == tfBlock.headerStart {
1618-
textStart = tfBlock.end + len(codeFence)
1619+
if eol := strings.IndexRune(docs[tfBlock.end:], '\n'); eol > -1 {
1620+
textStart = tfBlock.end + eol
1621+
} else {
1622+
// If no newline character is found, we are at the end of the doc.
1623+
textStart = len(docs)
1624+
}
16191625
continue
16201626
}
16211627
if stripSectionHeader < tfBlock.headerStart {
16221628
stripSection = false
16231629
}
16241630
}
1625-
// find the actual start index of the code
1626-
nextNewLine := strings.Index(docs[tfBlock.start:tfBlock.end], "\n")
1627-
if nextNewLine == -1 {
1628-
// write the line as-is; this is an in-line fence
1629-
fprintf("%s%s", docs[tfBlock.start:tfBlock.end], codeFence)
1630-
} else {
1631-
fenceLanguage := docs[tfBlock.start : tfBlock.start+nextNewLine+1]
1632-
hcl := docs[tfBlock.start+nextNewLine+1 : tfBlock.end]
1633-
1634-
// Only attempt to convert code blocks that are either explicitly marked as Terraform, or
1635-
// unmarked. For unmarked snippets further gate by a regex guess if it is actually Terraform.
1636-
if isHCL(fenceLanguage, hcl) {
1637-
// generate the code block and append
1638-
if g.language.shouldConvertExamples() {
1639-
hcl := docs[tfBlock.start+nextNewLine+1 : tfBlock.end]
1640-
1641-
// Most of our results should be HCL, so we try to convert it.
1642-
var e *Example
1643-
if useCoverageTracker {
1644-
e = g.coverageTracker.getOrCreateExample(
1645-
path.String(), hcl)
1631+
// Only attempt to convert code blocks that are either explicitly marked as Terraform, or
1632+
// unmarked. For unmarked snippets further gate by a regex guess if it is actually Terraform.
1633+
if hcl := tfBlock.code([]byte(docs)); isHCL(tfBlock.language, hcl) {
1634+
// generate the code block and append
1635+
if g.language.shouldConvertExamples() {
1636+
// Most of our results should be HCL, so we try to convert it.
1637+
var e *Example
1638+
if useCoverageTracker {
1639+
e = g.coverageTracker.getOrCreateExample(
1640+
path.String(), hcl)
1641+
}
1642+
langs := genLanguageToSlice(g.language)
1643+
convertedBlock, err := convertHCL(e, hcl, path.String(), langs)
1644+
if err != nil {
1645+
// We do not write this section, ever.
1646+
//
1647+
// We have to strip the entire section: any header, the code
1648+
// block, and any surrounding text.
1649+
stripSection = true
1650+
stripSectionHeader = tfBlock.headerStart
1651+
} else {
1652+
// append any headers and following text first
1653+
if hasHeader {
1654+
fprintf("%s", docs[tfBlock.headerStart:tfBlock.start])
16461655
}
1647-
langs := genLanguageToSlice(g.language)
1648-
convertedBlock, err := convertHCL(e, hcl, path.String(), langs)
1649-
if err != nil {
1650-
// We do not write this section, ever.
1651-
//
1652-
// We have to strip the entire section: any header, the code
1653-
// block, and any surrounding text.
1654-
stripSection = true
1655-
stripSectionHeader = tfBlock.headerStart
1656-
} else {
1657-
// append any headers and following text first
1658-
if hasHeader {
1659-
fprintf("%s", docs[tfBlock.headerStart:tfBlock.start])
1660-
}
1661-
1662-
switch g.language {
1663-
// If we are targeting the schema, then print code switcher
1664-
// fences for the registry.
1665-
case Schema:
1666-
fprintf("%s\n%s\n%s",
1667-
startPulumiCodeChooser,
1668-
convertedBlock,
1669-
endPulumiCodeChooser)
1670-
// Otherwise skip code switcher fences so they don't show up
1671-
// in generated SDKs.
1672-
default:
1673-
fprintf("%s", convertedBlock)
1674-
}
1656+
1657+
switch g.language {
1658+
// If we are targeting the schema, then print code switcher
1659+
// fences for the registry.
1660+
case Schema:
1661+
fprintf("%s\n%s\n%s",
1662+
startPulumiCodeChooser,
1663+
convertedBlock,
1664+
endPulumiCodeChooser)
1665+
// Otherwise skip code switcher fences so they don't show up
1666+
// in generated SDKs.
1667+
default:
1668+
fprintf("%s", convertedBlock)
16751669
}
16761670
}
1677-
} else {
1678-
// Take already-valid code blocks as-is.
1679-
if hasHeader {
1680-
fprintf("%s", docs[tfBlock.headerStart:tfBlock.start])
1681-
}
1682-
fprintf("%s"+codeFence, docs[tfBlock.start:tfBlock.end])
16831671
}
1672+
} else {
1673+
// Take already-valid code blocks as-is.
1674+
if hasHeader {
1675+
fprintf("%s", docs[tfBlock.headerStart:tfBlock.start])
1676+
}
1677+
fprintf("%s"+codeFence, docs[tfBlock.start:tfBlock.end])
1678+
}
1679+
1680+
// We want to start including non-code text after the end of the code block.
1681+
//
1682+
// The codeblock "ends" with the newline character at the end of the
1683+
// closing fence.
1684+
if eol := strings.IndexRune(docs[tfBlock.end:], '\n'); eol > -1 {
1685+
textStart = tfBlock.end + eol
1686+
} else {
1687+
// If no newline character is found, we are at the end of the doc.
1688+
textStart = len(docs)
16841689
}
1685-
// The non-code text starts up again after the last closing fences
1686-
textStart = tfBlock.end + len(codeFence)
16871690
}
16881691
// Append any remainder of the docs string to the output
16891692
if !stripSection {
@@ -2342,6 +2345,12 @@ func guessIsHCL(code string) bool {
23422345
}
23432346

23442347
func isHCL(fenceLanguage, code string) bool {
2345-
return fenceLanguage == "```terraform\n" || fenceLanguage == "```hcl\n" || fenceLanguage == "```tf\n" ||
2346-
(fenceLanguage == "```\n" && guessIsHCL(code))
2348+
switch fenceLanguage {
2349+
case "terraform", "hcl", "tf":
2350+
return true
2351+
case "":
2352+
return guessIsHCL(code)
2353+
default:
2354+
return false
2355+
}
23472356
}

0 commit comments

Comments
 (0)