Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ know what they are looking for.
| Bun | `npm` | `bun.lock`; `bun.lockb` presence as diagnostic |
| PyPI | `pypi` | `*.dist-info/METADATA`, `INSTALLER`, `direct_url.json`, `*.egg-info/PKG-INFO` |
| Go modules | `go` | `go.sum`, `go.mod` |
| Cargo (Rust) | `crates.io` | `~/.cargo/.crates2.json` (installed binaries), `Cargo.lock` |
| RubyGems | `rubygems` | `Gemfile.lock`, installed `*.gemspec` |
| Composer | `packagist` | `composer.lock`, `vendor/composer/installed.json` |
| MCP | `mcp` | JSON host configs: `mcp.json`, `.mcp.json`, `claude_desktop_config.json`, `mcp_config.json`, `mcp_settings.json`, `cline_mcp_settings.json`, plus `~/.gemini/settings.json` (Gemini CLI / Code Assist) and `~/.claude.json` (Claude Code user- and project-scoped `mcpServers`). Non-JSON configs (Codex `config.toml`, Continue YAML) are not parsed in v0.1. |
Expand Down
46 changes: 45 additions & 1 deletion docs/inventory-sources.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,51 @@ References:
- `go.sum` and `go.mod` reference: <https://go.dev/ref/mod#go-sum-files>
- Module cache layout: <https://go.dev/ref/mod#module-cache>

## Cargo (Rust)

Files read:

- `~/.cargo/.crates2.json` — Cargo's JSON record of every binary installed
via `cargo install`. Each `installs` entry's key is a
`"<name> <version> (<source>)"` triple that produces a high-confidence
record marked `direct_dependency=true` (every entry was an explicit
`cargo install` invocation). Dispatch is path-aware: the file is parsed
only when its parent directory is named `.cargo`.
- `Cargo.lock` — Cargo's TOML lockfile. Each `[[package]]` block with a
non-empty `source` produces a high-confidence record; this covers
registry crates as well as `git+…` and other sourced dependencies.
Blocks without a `source` are workspace-local crates (the root package
and any path dependencies) and are skipped: they have no registry
coordinate and cannot match a published-package exposure catalog entry.

Captured fields emitted on the record: `package_name`, `version`,
`package_manager=cargo`, `source_type` (`cargo-crates2-installs` or
`cargo-lock`), `confidence=high`, and `direct_dependency` on
`.crates2.json` records.

The user-package baseline already walks `~/.cargo`, so the installs file
is picked up without additional configuration. `Cargo.toml` (the
manifest) is intentionally not parsed: its version requirements are
ranges rather than exact pins and would produce ambiguous records.

We do not run `cargo install --list`, `cargo metadata`, or any other
Cargo subcommand.

Because the user-package baseline walks `~/.cargo` and dispatch on
`Cargo.lock` is by basename, any `Cargo.lock` under
`~/.cargo/registry/src/<index>/<crate>-<version>/` (the lockfile the
crate's author shipped at publish time) is parsed and emitted with
`source_file` inside that directory. These records reflect "what some
upstream crate author pinned," not "what the user selected," so they
overstate exposure on developer hosts and should be filtered downstream
by `source_file` prefix if that is undesirable. The `.crate` tarballs in
`~/.cargo/registry/cache/` are not unpacked and contribute no records.

References:

- Cargo `.crates2.json` install record: <https://doc.rust-lang.org/cargo/commands/cargo-install.html>
- Cargo lockfile reference: <https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html>

## RubyGems / Bundler

Files read:
Expand Down Expand Up @@ -650,7 +695,6 @@ strong installed-state correlation tooling today.

## Not currently covered

- Cargo (`Cargo.lock`).
- Maven / Gradle (`pom.xml`, lockfiles).
- NuGet (`packages.lock.json`).
- Hex (`mix.lock`).
Expand Down
280 changes: 280 additions & 0 deletions internal/ecosystem/cargo/cargo.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
// Package cargo scans Rust/Cargo package artifacts.
//
// Two on-disk surfaces are read:
//
// - `~/.cargo/.crates2.json` — the canonical record of every binary
// installed via `cargo install`. Highest-confidence baseline source:
// each entry names the crate, version, and source registry.
// - `Cargo.lock` — TOML lockfile listing the resolved dependency tree.
// Higher-confidence than `Cargo.toml` because versions are pinned.
//
// No `cargo` commands are executed. Detection is path-/filename-based.
// `Cargo.toml` (the manifest) is intentionally not parsed: version
// requirements there are ranges rather than exact pins and would
// produce ambiguous records.
package cargo

import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"sort"
	"strings"

	"github.com/perplexityai/bumblebee/internal/model"
)

// Ecosystem is the ecosystem identifier stamped on every record this
// package emits.
const Ecosystem = model.EcosystemCargo

// Fixed values stamped on emitted records, plus the file and directory
// names used to recognize Cargo artifacts on disk.
const (
	// packageManager is the PackageManager value set on every record.
	packageManager = "cargo"
	// crates2SourceType is the SourceType of records read from `.crates2.json`.
	crates2SourceType = "cargo-crates2-installs"
	// cargoLockSourceType is the SourceType of records read from `Cargo.lock`.
	cargoLockSourceType = "cargo-lock"
	// crates2FileName is the basename IsCrates2JSON requires.
	crates2FileName = ".crates2.json"
	// cargoLockFileName is the basename IsCargoLock matches.
	cargoLockFileName = "Cargo.lock"
	// cargoDirName is the parent-directory name IsCrates2JSON requires.
	cargoDirName = ".cargo"
)

// Scanner reads Cargo artifacts and reports what it finds through
// caller-supplied callbacks.
type Scanner struct {
	// MaxFileSize is the largest file size, in bytes, that will be read.
	// A value of zero or less disables the limit.
	MaxFileSize int64
	// Emit receives every record produced by a scan. It is called
	// without a nil check, so it must be set before scanning.
	Emit func(model.Record)
	// Diag, when non-nil, receives diagnostics about skipped input as a
	// (level, file path, message) triple. It may be left nil.
	Diag func(level, path, msg string)
}

// IsCargoLock reports whether base (a file's basename, not a full path)
// is exactly the Cargo lockfile name. The comparison is case-sensitive.
func IsCargoLock(base string) bool {
	return cargoLockFileName == base
}

// IsCrates2JSON reports whether path names a `.crates2.json` file that
// sits directly inside a directory called `.cargo`. Both the file's
// basename and its parent directory's basename must match, so a stray
// `.crates2.json` elsewhere on disk is not treated as a Cargo install
// record.
func IsCrates2JSON(path string) bool {
	if filepath.Base(path) != crates2FileName {
		return false
	}
	parent := filepath.Dir(path)
	return filepath.Base(parent) == cargoDirName
}

// crates2File is the on-disk shape of `~/.cargo/.crates2.json`. Cargo
// writes a single `installs` object whose keys are
// `"<name> <version> (<source>)"` triples. The crate name and version
// are taken from the key; the value carries install metadata that is
// decoded but not otherwise read by the scanner code in this file.
type crates2File struct {
	// Installs maps each install key to its metadata object.
	Installs map[string]crates2Install `json:"installs"`
}

// crates2Install is the per-entry metadata object. Only `bins` is
// decoded; every other member of the JSON object is ignored.
type crates2Install struct {
	// Bins holds the entry's `bins` array (presumably the names of the
	// installed binaries; confirm against Cargo's format).
	Bins []string `json:"bins"`
}

// ScanCrates2JSON reads the `.crates2.json` install record at path and
// emits one Record for every entry of its `installs` object whose key
// parses into a crate name and version.
//
// base supplies the fields shared by all records; the Cargo-specific
// fields (ecosystem, name, version, project path, package manager,
// source type, source file, direct-dependency flag, confidence) are
// overlaid on a copy of it. Entries are emitted in sorted key order so
// the output is reproducible from run to run; Go randomizes map
// iteration order, which would otherwise shuffle the records.
//
// A failure to read the file is returned to the caller. Malformed JSON
// is not an error: it is reported through Diag (when set) and the file
// is skipped. Keys that do not parse are skipped silently.
func (s *Scanner) ScanCrates2JSON(path string, base model.Record) error {
	data, err := s.readBounded(path)
	if err != nil {
		return err
	}
	var doc crates2File
	if err := json.Unmarshal(data, &doc); err != nil {
		if s.Diag != nil {
			s.Diag("warn", path, "skipping malformed .crates2.json: "+err.Error())
		}
		return nil
	}

	// Collect and sort the keys up front so emission order is stable.
	keys := make([]string, 0, len(doc.Installs))
	for key := range doc.Installs {
		keys = append(keys, key)
	}
	sort.Strings(keys)

	projectPath := filepath.Dir(path)
	for _, key := range keys {
		name, version, ok := parseCrates2InstallKey(key)
		if !ok {
			continue
		}
		r := base
		r.Ecosystem = Ecosystem
		r.PackageName = name
		r.NormalizedName = strings.ToLower(name)
		r.Version = version
		r.ProjectPath = projectPath
		r.PackageManager = packageManager
		r.SourceType = crates2SourceType
		r.SourceFile = path
		// `.crates2.json` only records crates the user explicitly ran
		// `cargo install` on, so every entry is a direct dependency.
		// The flag is declared inside the loop so each record points at
		// its own bool rather than sharing one.
		direct := true
		r.DirectDependency = &direct
		r.Confidence = "high"
		s.Emit(r)
	}
	return nil
}

// parseCrates2InstallKey splits a `.crates2.json` install key into its
// crate name and version. The key shape is
// `"<name> <version> (<source>)"`: a crate-name token, a version token,
// then a parenthesized source descriptor. The source descriptor is
// optional, so a bare `"<name> <version>"` is also accepted.
//
// The key is tokenized on runs of whitespace, so leading, trailing, or
// repeated separators do not leak into the result. Only the first two
// tokens are used; anything after them (the source, which may itself
// contain spaces) is ignored.
//
// ok is false, and name and version are empty, when the key has fewer
// than two tokens or when the second token begins with `(`. The latter
// means the version is missing and the source descriptor has taken its
// place.
func parseCrates2InstallKey(key string) (name, version string, ok bool) {
	fields := strings.Fields(key)
	if len(fields) < 2 {
		return "", "", false
	}
	name, version = fields[0], fields[1]
	if strings.HasPrefix(version, "(") {
		// `"<name> (<source>)"`: no version token present.
		return "", "", false
	}
	return name, version, true
}

// ScanCargoLock reads the Cargo.lock at path and emits one Record per
// distinct (name, version) pair among its sourced `[[package]]` blocks.
//
// base supplies the fields shared by all records; the Cargo-specific
// fields are overlaid on a copy of it. Blocks missing a name or version
// are ignored, as are blocks with no `source`: those are workspace-local
// crates (the root package and path dependencies), i.e. the user's own
// code rather than third-party artifacts. Records are emitted in the
// order the blocks appear in the file.
//
// A failure to read the file is returned to the caller.
func (s *Scanner) ScanCargoLock(path string, base model.Record) error {
	contents, err := s.readBounded(path)
	if err != nil {
		return err
	}
	packages := parseCargoLockPackages(contents)
	dir := filepath.Dir(path)
	emitted := make(map[string]struct{}, len(packages))
	for i := range packages {
		pkg := packages[i]
		switch {
		case pkg.name == "", pkg.version == "":
			// Incomplete block: nothing useful to report.
			continue
		case pkg.source == "":
			// Workspace-local crate; it has no registry coordinate, and
			// reporting it could collide by name with a published crate.
			continue
		}
		// NUL cannot occur in either component, so the joined key is
		// unambiguous.
		id := pkg.name + "\x00" + pkg.version
		if _, already := emitted[id]; already {
			continue
		}
		emitted[id] = struct{}{}

		rec := base
		rec.Ecosystem = Ecosystem
		rec.PackageName = pkg.name
		rec.NormalizedName = strings.ToLower(pkg.name)
		rec.Version = pkg.version
		rec.ProjectPath = dir
		rec.PackageManager = packageManager
		rec.SourceType = cargoLockSourceType
		rec.SourceFile = path
		rec.Confidence = "high"
		s.Emit(rec)
	}
	return nil
}

// cargoLockPackage holds the fields extracted from one `[[package]]`
// block of a Cargo.lock file. A field the block does not set is left as
// the empty string.
type cargoLockPackage struct {
	// name is the block's `name` value.
	name string
	// version is the block's `version` value.
	version string
	// source is the block's `source` value; ScanCargoLock treats an
	// empty source as a workspace-local crate and skips it.
	source string
}

// parseCargoLockPackages walks a Cargo.lock body line by line and
// returns one entry per `[[package]]` block, carrying that block's
// `name`, `version`, and `source` values (empty when absent).
//
// This is a deliberately small, line-oriented reader rather than a TOML
// parser: Cargo.lock is machine-generated with one quoted-string value
// per line for the fields of interest, so no TOML dependency is needed.
// Any line starting with `[` is treated as a table header that ends the
// current block; only the exact header `[[package]]` opens a new one.
// Blank lines and `#` comments are skipped. Scanning stops at the first
// line longer than the 4 MiB buffer limit.
func parseCargoLockPackages(data []byte) []cargoLockPackage {
	var (
		pkgs   []cargoLockPackage
		cur    cargoLockPackage
		active bool // true while inside a [[package]] block
	)
	lines := bufio.NewScanner(bytes.NewReader(data))
	lines.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
	for lines.Scan() {
		text := strings.TrimSpace(lines.Text())
		switch {
		case text == "", strings.HasPrefix(text, "#"):
			// Blank line or comment: nothing to do.
		case strings.HasPrefix(text, "["):
			// Any table header closes the block being collected.
			if active {
				pkgs = append(pkgs, cur)
			}
			cur = cargoLockPackage{}
			active = text == "[[package]]"
		case active:
			if field, val, ok := parseCargoLockField(text); ok {
				switch field {
				case "name":
					cur.name = val
				case "version":
					cur.version = val
				case "source":
					cur.source = val
				}
			}
		}
	}
	// The last block has no following header to close it.
	if active {
		pkgs = append(pkgs, cur)
	}
	return pkgs
}

// parseCargoLockField pulls the key and the quoted-string value out of
// a line shaped like `key = "value"`.
//
// The key is everything before the first `=`, trimmed. The value is the
// text between the opening quote and the next `"`; escape sequences are
// not interpreted, and anything after the closing quote is ignored.
// ok is false (with empty key and value) when the line has no `=`,
// starts with `=`, has a value that is not a double-quoted string, or
// has no closing quote.
func parseCargoLockField(line string) (key, value string, ok bool) {
	sep := strings.IndexByte(line, '=')
	if sep < 1 {
		return "", "", false
	}
	rhs := strings.TrimSpace(line[sep+1:])
	if len(rhs) < 2 || rhs[0] != '"' {
		return "", "", false
	}
	unquoted := rhs[1:]
	if closing := strings.IndexByte(unquoted, '"'); closing >= 0 {
		return strings.TrimSpace(line[:sep]), unquoted[:closing], true
	}
	return "", "", false
}

// readBounded opens path and returns its contents, refusing anything
// that is not a regular file or that exceeds MaxFileSize.
//
// A MaxFileSize of zero or less disables the size limit. Otherwise the
// limit is enforced twice: once against the size reported by Stat, and
// again while reading, so a file that grows after the Stat call, or
// whose reported size understates its content, still cannot make the
// scanner buffer more than MaxFileSize bytes.
//
// Errors from opening, stat-ing, or reading the file are returned
// unchanged. An oversized file is reported through Diag (when set) and
// also returned as an error. No partial data is returned on failure.
func (s *Scanner) readBounded(path string) ([]byte, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	info, err := f.Stat()
	if err != nil {
		return nil, err
	}
	if !info.Mode().IsRegular() {
		return nil, errors.New("not a regular file")
	}
	if s.MaxFileSize <= 0 {
		return io.ReadAll(f)
	}

	// tooLarge reports the oversized file and builds the returned error.
	tooLarge := func(size int64) error {
		if s.Diag != nil {
			s.Diag("warn", path, fmt.Sprintf("skipping: size %d exceeds max %d", size, s.MaxFileSize))
		}
		return fmt.Errorf("file %s exceeds max size %d", path, s.MaxFileSize)
	}
	if info.Size() > s.MaxFileSize {
		return nil, tooLarge(info.Size())
	}

	data, err := io.ReadAll(io.LimitReader(f, s.MaxFileSize))
	if err != nil {
		return nil, err
	}
	if int64(len(data)) == s.MaxFileSize {
		// The read stopped exactly at the cap; probe one more byte to
		// tell "file is exactly MaxFileSize" from "file is larger". A
		// probe error is deliberately ignored: with zero bytes read there
		// is no evidence of overflow, and the data is already complete.
		var probe [1]byte
		if n, _ := f.Read(probe[:]); n > 0 {
			// The true size is unknown here; report the bytes seen so
			// far, which is a lower bound.
			return nil, tooLarge(int64(len(data)) + int64(n))
		}
	}
	return data, nil
}
Loading