repomap/parser_generic.go at main · dotcommander/repomap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package repomap

import (
	"os"
	"regexp"
	"strings"
)

// maxScanLines is the maximum number of lines scanned per file.
// ParseGenericFile truncates the file's line slice to this length before
// handing it to a parser, so declarations that appear after this many lines
// are not reported.
const maxScanLines = 500

// --- TypeScript / JavaScript patterns ---

// None of the patterns below is anchored, so each can match anywhere within
// the text it is applied to.
var (
	// tsExportDecl matches a named export declaration such as
	// `export function foo` or `export const Bar`. Group 1 is the declaration
	// keyword and group 2 is the declared name.
	tsExportDecl    = regexp.MustCompile(`export\s+(function|class|interface|type|const|enum)\s+(\w+)`)
	// tsExportDefault matches a named default export, i.e.
	// `export default function Foo` or `export default class Foo`. Group 1 is
	// the keyword and group 2 the name; a name is required, so anonymous
	// default exports do not match.
	tsExportDefault = regexp.MustCompile(`export\s+default\s+(function|class)\s+(\w+)`)
	// tsReExport matches a braced export list, `export { ... }`. Group 1 is
	// the raw text between the braces.
	tsReExport      = regexp.MustCompile(`export\s+\{([^}]+)\}`)
	// tsImportFrom matches `import ... from '<module>'` with either quote
	// style. Group 1 is the module specifier. The `from` clause is required,
	// so a bare side-effect import (`import 'x'`) does not match.
	tsImportFrom    = regexp.MustCompile(`import\s+.*\s+from\s+['"]([^'"]+)['"]`)
	// tsRequire matches a `require('<module>')` call with a string-literal
	// argument. Group 1 is the module specifier.
	tsRequire       = regexp.MustCompile(`require\s*\(\s*['"]([^'"]+)['"]\s*\)`)
)

// --- Python patterns ---

// Every pattern below is anchored with ^. parsePython applies them to the
// untrimmed line, so only statements that begin in the first column match;
// indented (nested) definitions and imports are not captured.
var (
	// pyFunc matches `def name(`. Group 1 is the function name, which must
	// start with a letter, so names beginning with an underscore do not match.
	pyFunc   = regexp.MustCompile(`^def\s+([A-Za-z]\w*)\s*\(`)
	// pyClass matches `class Name`. Group 1 is the class name.
	pyClass  = regexp.MustCompile(`^class\s+(\w+)`)
	// pyConst matches an assignment to an upper-case name such as `MAX_SIZE =`.
	// Group 1 is the name: an upper-case letter followed by one or more
	// upper-case letters, digits or underscores (so at least two characters).
	pyConst  = regexp.MustCompile(`^([A-Z][A-Z_0-9]+)\s*=`)
	// pyImport matches `import name`. Group 1 is the first run of word
	// characters after `import`, so `import a.b` yields "a" and only the first
	// module of a comma-separated list is captured.
	pyImport = regexp.MustCompile(`^import\s+(\w+)`)
	// pyFrom matches `from name ...`. Group 1 is the first run of word
	// characters after `from`; a leading dot is not a word character, so
	// relative imports (`from . import x`) do not match.
	pyFrom   = regexp.MustCompile(`^from\s+(\w+)`)
)

// parserFunc is a language-specific line parser. It is given the lines of a
// file and the FileSymbols being built for it; it has no return value, so
// everything it finds is reported by mutating fs.
type parserFunc func(lines []string, fs *FileSymbols)

// ParseGenericFile extracts symbols from a non-Go source file using regex
// patterns. path is absolute, root is the project root for relative path
// calculation, and language selects which entry of langParsers is run.
//
// Only the first maxScanLines lines of the file are examined. If no parser is
// registered for language, no symbols or imports are collected and the result
// carries just the path, language, parse method and any derived import path.
//
// The error from os.ReadFile is returned unchanged, with a nil *FileSymbols.
func ParseGenericFile(path, root, language string) (*FileSymbols, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, err
	}

	fs := &FileSymbols{
		Path:        relPath(root, path),
		Language:    language,
		ParseMethod: "regex",
	}

	// Ask for at most maxScanLines+1 pieces: the first maxScanLines are the
	// lines we want, and the extra piece (if any) holds the unsplit remainder
	// of the file, which is dropped below. This avoids splitting a large file
	// into lines that are never looked at.
	lines := strings.SplitN(string(data), "\n", maxScanLines+1)
	if len(lines) > maxScanLines {
		lines = lines[:maxScanLines]
	}

	if fn, ok := langParsers[language]; ok {
		fn(lines, fs)
	}

	// Derive ImportPath for languages where we can determine a logical module
	// identity from filesystem layout or declarations. This enables symbol-dep
	// edges in commit grouping across non-Go files. A parser that already set
	// ImportPath takes precedence.
	if fs.ImportPath == "" {
		fs.ImportPath = deriveImportPath(path, root, language, lines)
	}

	return fs, nil
}

// parseTS scans TypeScript/JavaScript lines and records exported symbols and
// imported module specifiers on fs. Each line is whitespace-trimmed and then
// tried against the patterns in a fixed order: export declaration, default
// export, braced export list, import-from, require. The first pattern that
// matches decides how the line is recorded; later patterns are not consulted.
func parseTS(lines []string, fs *FileSymbols) {
	// export appends one exported symbol to fs.
	export := func(name, kind string, lineNo int) {
		fs.Symbols = append(fs.Symbols, Symbol{Name: name, Kind: kind, Exported: true, Line: lineNo})
	}

	for i, raw := range lines {
		text := strings.TrimSpace(raw)
		lineNo := i + 1 // reported line numbers start at 1

		if m := tsExportDecl.FindStringSubmatch(text); m != nil {
			export(m[2], m[1], lineNo)
		} else if m := tsExportDefault.FindStringSubmatch(text); m != nil {
			export(m[2], m[1], lineNo)
		} else if m := tsReExport.FindStringSubmatch(text); m != nil {
			// One symbol per entry of the braced list.
			for _, name := range splitReExportNames(m[1]) {
				export(name, "reexport", lineNo)
			}
		} else if m := tsImportFrom.FindStringSubmatch(text); m != nil {
			fs.Imports = append(fs.Imports, m[1])
		} else if m := tsRequire.FindStringSubmatch(text); m != nil {
			fs.Imports = append(fs.Imports, m[1])
		}
	}
}

// splitReExportNames splits the contents of a braced export list such as
// "Foo, Bar as Baz" into one name per comma-separated entry.
//
// The name returned for an entry is its local name, i.e. the identifier that
// precedes any "as" clause: "Bar as Baz" yields "Bar". A leading inline
// `type` modifier ("type Foo", "type Foo as Bar") is skipped so that the
// entry yields "Foo" rather than "type". Empty entries, such as the one
// produced by a trailing comma, are dropped. The result is nil when raw
// contains no names.
func splitReExportNames(raw string) []string {
	var names []string
	for _, entry := range strings.Split(raw, ",") {
		// Fields both trims the entry and splits "Foo as Bar" into words; a
		// blank entry produces no fields.
		fields := strings.Fields(entry)
		if len(fields) == 0 {
			continue
		}

		name := fields[0]
		// "type Foo" / "type Foo as Bar": the first word is a modifier, not
		// the name. A lone "type" or "type as X" is left alone and treated as
		// an identifier that happens to be called type.
		if name == "type" && len(fields) > 1 && fields[1] != "as" {
			name = fields[1]
		}
		names = append(names, name)
	}
	return names
}

// parsePython scans Python lines and records functions, classes, upper-case
// constants and imports on fs. Lines that fall inside a multi-line
// triple-quoted string are skipped so that text in docstrings is not mistaken
// for declarations. The declaration patterns are applied to the untrimmed
// line, and the first one that matches wins.
func parsePython(lines []string, fs *FileSymbols) {
	// openQuote is the delimiter of the triple-quoted string we are currently
	// inside, or "" when we are not inside one.
	openQuote := ""

	// define appends one exported symbol to fs.
	define := func(name, kind string, lineNo int) {
		fs.Symbols = append(fs.Symbols, Symbol{Name: name, Kind: kind, Exported: true, Line: lineNo})
	}

	for i, raw := range lines {
		text := strings.TrimSpace(raw)

		// Inside a block: consume the line, and leave the block if the
		// closing delimiter appears anywhere on it.
		if openQuote != "" {
			if strings.Contains(text, openQuote) {
				openQuote = ""
			}
			continue
		}

		// A line that starts with a triple quote opens a block unless the
		// same delimiter also closes it later on the same line.
		switch {
		case strings.HasPrefix(text, `"""`):
			openQuote = `"""`
		case strings.HasPrefix(text, `'''`):
			openQuote = `'''`
		}
		if openQuote != "" {
			if !strings.Contains(text[len(openQuote):], openQuote) {
				continue
			}
			openQuote = "" // opened and closed on one line
		}

		lineNo := i + 1 // reported line numbers start at 1
		if m := pyFunc.FindStringSubmatch(raw); m != nil {
			define(m[1], "function", lineNo)
		} else if m := pyClass.FindStringSubmatch(raw); m != nil {
			define(m[1], "class", lineNo)
		} else if m := pyConst.FindStringSubmatch(raw); m != nil {
			define(m[1], "const", lineNo)
		} else if m := pyImport.FindStringSubmatch(raw); m != nil {
			fs.Imports = append(fs.Imports, m[1])
		} else if m := pyFrom.FindStringSubmatch(raw); m != nil {
			fs.Imports = append(fs.Imports, m[1])
		}
	}
}