package rustdocs

import (
	"net/url"
	"regexp"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// parserDefaultBaseURL is the base that NewParser configures and that
// ParseSearchResults falls back to when a Parser has no base URL set.
const parserDefaultBaseURL = "https://docs.rs"

// Patterns are compiled once at package scope instead of on every call.
var (
	// sinceVersionRe finds a dotted three-part version inside ".since" text.
	sinceVersionRe = regexp.MustCompile(`\d+\.\d+\.\d+`)
	// subHeadingVersionRe captures a dotted three-part version, optionally
	// prefixed with "v", inside ".sub-heading" text.
	subHeadingVersionRe = regexp.MustCompile(`v?(\d+\.\d+\.\d+)`)
	// githubRepoRe matches the "https://github.com/<owner>/<repo>" prefix of a link.
	githubRepoRe = regexp.MustCompile(`https://github\.com/[^/]+/[^/]+`)
	// whitespaceRunRe matches one or more consecutive whitespace characters.
	whitespaceRunRe = regexp.MustCompile(`\s+`)
)

// itemKindTable is the single source of truth for recognising item kinds.
// class is the substring looked for in the heading span's class attribute and
// headings are the textual prefixes of the page heading. Entries are checked
// in order, first match wins.
var itemKindTable = []struct {
	class    string
	headings []string
	kind     ItemKind
}{
	{"struct", []string{"Struct "}, ItemKindStruct},
	{"enum", []string{"Enum "}, ItemKindEnum},
	{"trait", []string{"Trait "}, ItemKindTrait},
	{"fn", []string{"Fn ", "Function "}, ItemKindFn},
	{"macro", []string{"Macro "}, ItemKindMacro},
	{"const", []string{"Const "}, ItemKindConst},
	{"static", []string{"Static "}, ItemKindStatic},
	{"mod", []string{"Module "}, ItemKindMod},
	{"type", []string{"Type "}, ItemKindType},
}

// splitHeading splits a heading such as "Struct Foo" into its kind and the
// remaining text. ok is false when the heading starts with no known prefix.
func splitHeading(heading string) (kind ItemKind, name string, ok bool) {
	for _, entry := range itemKindTable {
		for _, prefix := range entry.headings {
			if strings.HasPrefix(heading, prefix) {
				return entry.kind, strings.TrimPrefix(heading, prefix), true
			}
		}
	}
	return "", "", false
}

// Parser extracts structured data from rustdoc-generated HTML pages.
type Parser struct {
	// baseURL is the URL against which relative links found in pages are resolved.
	baseURL string
}

// NewParser returns a Parser that resolves relative links against https://docs.rs.
func NewParser() *Parser {
	return &Parser{baseURL: parserDefaultBaseURL}
}

// ParseCratePage parses the HTML of a crate page and returns its metadata and
// the items listed on it. docURL is stored on the result as-is and FetchedAt
// is set to the current time. An error is returned only if the HTML cannot be
// parsed.
func (p *Parser) ParseCratePage(html string, docURL string) (*Crate, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	return &Crate{
		DocURL:      docURL,
		FetchedAt:   time.Now(),
		Name:        p.extractCrateName(doc),
		Version:     p.extractVersion(doc),
		Description: p.extractDescription(doc),
		Repository:  p.extractRepository(doc),
		Modules:     p.extractModules(doc),
		Structs:     p.extractStructs(doc),
		Enums:       p.extractEnums(doc),
		Traits:      p.extractTraits(doc),
		Functions:   p.extractFunctions(doc),
		Macros:      p.extractMacros(doc),
		Constants:   p.extractConstants(doc),
		Statics:     p.extractStatics(doc),
	}, nil
}

// ParseItemPage parses the HTML of a single item page (struct, enum, fn, ...)
// and returns its name, path, kind, signature and top-level documentation.
// docURL is stored on the result and is also used to derive the path when the
// page has no breadcrumbs.
func (p *Parser) ParseItemPage(html string, docURL string) (*Symbol, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	return &Symbol{
		DocURL:    docURL,
		Name:      p.extractItemName(doc),
		Path:      p.extractItemPath(doc, docURL),
		Kind:      p.extractItemKind(doc),
		Signature: p.extractItemSignature(doc),
		Doc:       p.extractItemDoc(doc),
	}, nil
}

// ParseSearchResults parses a search-results page and returns one entry per
// anchor under "#results .search-results". Relative links are resolved against
// the parser's base URL (https://docs.rs when none is set).
func (p *Parser) ParseSearchResults(html string) ([]*SearchResult, error) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		return nil, err
	}

	// Keep working for a zero-value Parser, which has no base URL configured.
	base := p.baseURL
	if base == "" {
		base = parserDefaultBaseURL
	}

	var results []*SearchResult
	doc.Find("#results .search-results a").Each(func(_ int, s *goquery.Selection) {
		result := &SearchResult{}

		// A missing class attribute yields "", which maps to an empty kind.
		classes, _ := s.Attr("class")
		result.Kind = extractKindFromClasses(classes)

		nameEl := s.Find(".result-name")
		result.Name = strings.TrimSpace(nameEl.Find(".method, .struct, .fn, .trait, .enum, .mod, .macro, .const, .static, .attr").Text())
		if result.Name == "" {
			// Fall back to the first line of the raw name text.
			nameText := nameEl.Text()
			if nl := strings.IndexByte(nameText, '\n'); nl >= 0 {
				nameText = nameText[:nl]
			}
			result.Name = strings.TrimSpace(nameText)
		}

		var pathParts []string
		nameEl.Find(".path span").Each(func(_ int, span *goquery.Selection) {
			if part := strings.TrimSpace(span.Text()); part != "" {
				pathParts = append(pathParts, part)
			}
		})
		result.Path = strings.Join(pathParts, "::")

		result.Description = strings.TrimSpace(s.Find(".desc").Text())

		if href, exists := s.Attr("href"); exists {
			// resolveURL keeps query strings and fragments intact; assigning
			// the href to url.URL.Path would percent-escape them.
			result.DocURL = resolveURL(base, href)
		}

		if stab := s.Find(".stab"); stab.Length() > 0 {
			if stab.HasClass("unstable") || stab.HasClass("experimental") {
				result.IsExperimental = true
			}
			result.Stability = strings.TrimSpace(stab.Text())
		}

		results = append(results, result)
	})

	return results, nil
}

// extractCrateName returns the crate name from the page heading, stripping a
// leading "Crate " or "Module ". If the main heading has neither prefix, the
// first <h1> is tried for "Crate "; failing that the trimmed main heading is
// returned unchanged.
func (p *Parser) extractCrateName(doc *goquery.Document) string {
	title := strings.TrimSpace(doc.Find(".main-heading h1").Text())
	for _, prefix := range []string{"Crate ", "Module "} {
		if strings.HasPrefix(title, prefix) {
			return strings.TrimPrefix(title, prefix)
		}
	}

	h1 := strings.TrimSpace(doc.Find("h1").First().Text())
	if strings.HasPrefix(h1, "Crate ") {
		return strings.TrimPrefix(h1, "Crate ")
	}
	return title
}

// extractVersion returns the first x.y.z version found in ".since" text, or
// failing that in ".sub-heading" text. It returns "" when neither has one.
func (p *Parser) extractVersion(doc *goquery.Document) string {
	if since := doc.Find(".since").Text(); since != "" {
		if match := sinceVersionRe.FindString(since); match != "" {
			return match
		}
	}

	subHeading := doc.Find(".sub-heading").Text()
	if match := subHeadingVersionRe.FindStringSubmatch(subHeading); len(match) > 1 {
		return match[1]
	}
	return ""
}

// extractDescription returns the trimmed text of the first ".top-doc .docblock",
// falling back to the first ".docblock" anywhere, or "" if there is none.
func (p *Parser) extractDescription(doc *goquery.Document) string {
	for _, selector := range []string{".top-doc .docblock", ".docblock"} {
		if block := doc.Find(selector).First(); block.Length() > 0 {
			return strings.TrimSpace(block.Text())
		}
	}
	return ""
}

// extractRepository returns the "https://github.com/<owner>/<repo>" prefix of
// the page's "a.src" link, or "" when there is no such link or it does not
// point at GitHub.
func (p *Parser) extractRepository(doc *goquery.Document) string {
	href, exists := doc.Find("a.src").Attr("href")
	if !exists || !strings.Contains(href, "github.com") {
		return ""
	}
	return githubRepoRe.FindString(href)
}

// extractItemName returns the main heading text with a leading kind word
// ("Struct ", "Fn ", "Function ", ...) removed. Headings without a known
// prefix are returned trimmed but otherwise unchanged.
func (p *Parser) extractItemName(doc *goquery.Document) string {
	h1 := strings.TrimSpace(doc.Find(".main-heading h1").Text())
	if _, name, ok := splitHeading(h1); ok {
		return name
	}
	return h1
}

// extractItemPath returns the item's path. The page's ".rustdoc-breadcrumbs"
// text is preferred; otherwise the path is derived from docURL by dropping
// the leading "/", a trailing "/index.html" or ".html", and replacing "/"
// with "::". It returns "" when neither source is usable.
func (p *Parser) extractItemPath(doc *goquery.Document, docURL string) string {
	breadcrumbs := strings.TrimSpace(doc.Find(".rustdoc-breadcrumbs").Text())
	breadcrumbs = strings.ReplaceAll(breadcrumbs, "\n", "")
	// NOTE(review): as written this replaces a single space with a single
	// space, i.e. it is a no-op. It presumably was meant to collapse doubled
	// spaces; confirm the intent before changing it.
	breadcrumbs = strings.ReplaceAll(breadcrumbs, " ", " ")
	breadcrumbs = strings.TrimSpace(breadcrumbs)
	if breadcrumbs != "" {
		return breadcrumbs
	}

	if docURL == "" {
		return ""
	}
	u, err := url.Parse(docURL)
	if err != nil {
		return ""
	}

	path := strings.TrimPrefix(u.Path, "/")
	path = strings.TrimSuffix(path, "/index.html")
	path = strings.TrimSuffix(path, ".html")
	return strings.ReplaceAll(path, "/", "::")
}

// extractItemKind determines the item's kind. It first inspects the class
// attribute of the first <span> inside the main heading, and otherwise falls
// back to the textual prefix of the heading. It returns "" when the kind
// cannot be determined.
func (p *Parser) extractItemKind(doc *goquery.Document) ItemKind {
	if span := doc.Find(".main-heading h1 span").First(); span.Length() > 0 {
		// A missing class attribute yields "", which matches no entry.
		class, _ := span.Attr("class")
		for _, entry := range itemKindTable {
			if strings.Contains(class, entry.class) {
				return entry.kind
			}
		}
	}

	// The heading is trimmed so that this fallback recognises the same
	// headings as extractItemName.
	title := strings.TrimSpace(doc.Find(".main-heading h1").Text())
	kind, _, _ := splitHeading(title)
	return kind
}

// extractItemSignature returns the trimmed text of "pre.rust.item-decl", or of
// the first "pre.rust" when the former is absent or empty.
func (p *Parser) extractItemSignature(doc *goquery.Document) string {
	if sig := strings.TrimSpace(doc.Find("pre.rust.item-decl").Text()); sig != "" {
		return sig
	}
	return strings.TrimSpace(doc.Find("pre.rust").First().Text())
}

// extractItemDoc returns the trimmed text of the first ".top-doc .docblock",
// falling back to the first ".docblock" anywhere, or "" if there is none.
func (p *Parser) extractItemDoc(doc *goquery.Document) string {
	for _, selector := range []string{".top-doc .docblock", ".docblock"} {
		if block := doc.Find(selector).First(); block.Length() > 0 {
			return strings.TrimSpace(block.Text())
		}
	}
	return ""
}

// listedItem holds the fields shared by every kind of entry in an item listing.
type listedItem struct {
	name         string
	docURL       string
	doc          string
	experimental bool
}

// eachListedItem calls visit for every element matching selector that yields
// a non-empty name. The name is the text of the nameSelector descendants, or
// of the first descendant <a> when that is empty. The link is always taken
// from the first descendant <a>.
//
// NOTE(review): names are only looked up among descendants, so a matched
// element that is itself the anchor yields no name and is skipped. Confirm
// this is intended for the bare class selectors (".struct", ".fn", ...).
func (p *Parser) eachListedItem(doc *goquery.Document, selector, nameSelector string, visit func(item listedItem, s *goquery.Selection)) {
	doc.Find(selector).Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find(nameSelector).Text())
		if name == "" {
			name = strings.TrimSpace(s.Find("a").First().Text())
		}
		if name == "" {
			return
		}

		item := listedItem{
			name:         name,
			doc:          strings.TrimSpace(s.Find(".desc, .item-desc").Text()),
			experimental: s.Find(".stab.unstable, .stab.experimental").Length() > 0,
		}
		if href, exists := s.Find("a").First().Attr("href"); exists {
			item.docURL = resolveURL(p.baseURL, href)
		}
		visit(item, s)
	})
}

// extractModules collects the modules listed on the page.
func (p *Parser) extractModules(doc *goquery.Document) []*Module {
	var modules []*Module
	p.eachListedItem(doc, ".item-table .mod, .module-item .mod", "a.mod", func(item listedItem, _ *goquery.Selection) {
		modules = append(modules, &Module{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return modules
}

// extractStructs collects the structs listed on the page.
func (p *Parser) extractStructs(doc *goquery.Document) []*Struct {
	var structs []*Struct
	p.eachListedItem(doc, ".item-table .struct, .struct", "a.struct", func(item listedItem, _ *goquery.Selection) {
		structs = append(structs, &Struct{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return structs
}

// extractEnums collects the enums listed on the page.
func (p *Parser) extractEnums(doc *goquery.Document) []*Enum {
	var enums []*Enum
	p.eachListedItem(doc, ".item-table .enum, .enum", "a.enum", func(item listedItem, _ *goquery.Selection) {
		enums = append(enums, &Enum{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return enums
}

// extractTraits collects the traits listed on the page.
func (p *Parser) extractTraits(doc *goquery.Document) []*Trait {
	var traits []*Trait
	p.eachListedItem(doc, ".item-table .trait, .trait", "a.trait", func(item listedItem, _ *goquery.Selection) {
		traits = append(traits, &Trait{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return traits
}

// extractFunctions collects the functions listed on the page. A function is
// flagged unsafe when the text of its listing element contains "unsafe".
func (p *Parser) extractFunctions(doc *goquery.Document) []*Func {
	var funcs []*Func
	p.eachListedItem(doc, ".item-table .fn, .fn, .function", "a.fn", func(item listedItem, s *goquery.Selection) {
		funcs = append(funcs, &Func{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
			IsUnsafe:       strings.Contains(s.Text(), "unsafe"),
		})
	})
	return funcs
}

// extractMacros collects the macros listed on the page.
func (p *Parser) extractMacros(doc *goquery.Document) []*Macro {
	var macros []*Macro
	p.eachListedItem(doc, ".item-table .macro, .macro", "a.macro", func(item listedItem, _ *goquery.Selection) {
		macros = append(macros, &Macro{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return macros
}

// extractConstants collects the constants listed on the page.
func (p *Parser) extractConstants(doc *goquery.Document) []*Const {
	var constants []*Const
	p.eachListedItem(doc, ".item-table .constant, .constant", "a.constant", func(item listedItem, _ *goquery.Selection) {
		constants = append(constants, &Const{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return constants
}

// extractStatics collects the statics listed on the page.
func (p *Parser) extractStatics(doc *goquery.Document) []*Static {
	var statics []*Static
	p.eachListedItem(doc, ".item-table .static, .static", "a.static", func(item listedItem, _ *goquery.Selection) {
		statics = append(statics, &Static{
			Name:           item.name,
			DocURL:         item.docURL,
			Doc:            item.doc,
			IsExperimental: item.experimental,
		})
	})
	return statics
}

// ExtractMethods collects the methods found in the document's method toggles.
// The unsafe/async/const flags are set when the signature text contains the
// corresponding keyword as a substring. Entries without a name are skipped.
func (p *Parser) ExtractMethods(doc *goquery.Document) []*Method {
	var methods []*Method
	doc.Find(".impl-items .method-toggle, details.method-toggle").Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find(".fn, .method, h4.code-header").Text())
		if name == "" {
			name = strings.TrimSpace(s.Find("section.method").Find(".fn").Text())
		}
		if name == "" {
			return
		}

		signature := strings.TrimSpace(s.Find("pre, .code-header, h4.code-header").Text())
		methods = append(methods, &Method{
			Name:           name,
			Signature:      signature,
			Doc:            strings.TrimSpace(s.Find(".docblock").Text()),
			IsUnsafe:       strings.Contains(signature, "unsafe"),
			IsAsync:        strings.Contains(signature, "async"),
			IsConst:        strings.Contains(signature, "const"),
			IsExperimental: s.Find(".stab.unstable, .stab.experimental").Length() > 0,
		})
	})
	return methods
}

// ExtractStructFields collects the struct fields found in the document. A
// field is flagged public when its element text contains "pub". Entries
// without a name are skipped.
func (p *Parser) ExtractStructFields(doc *goquery.Document) []*Field {
	var fields []*Field
	doc.Find(".struct .fields tr, .struct-member").Each(func(_ int, s *goquery.Selection) {
		name := strings.TrimSpace(s.Find(".structfield, td:first-child").Text())
		if name == "" {
			return
		}
		fields = append(fields, &Field{
			Name:  name,
			Type:  strings.TrimSpace(s.Find(".type, td:nth-child(2)").Text()),
			Doc:   strings.TrimSpace(s.Find(".docblock, td:last-child").Text()),
			IsPub: strings.Contains(s.Text(), "pub"),
		})
	})
	return fields
}

// ExtractEnumVariants collects the enum variants found in the document. A
// variant whose text contains "{" is a struct variant, one containing "("
// but no "{" is a tuple variant, and anything else is a unit variant.
// Entries without a name are skipped.
func (p *Parser) ExtractEnumVariants(doc *goquery.Document) []*Variant {
	var variants []*Variant
	doc.Find(".enum .variants li, .variant").Each(func(_ int, s *goquery.Selection) {
		text := s.Text()

		name := strings.TrimSpace(s.Find("a, .variant-name").Text())
		if name == "" {
			name = strings.TrimSpace(text)
		}
		if name == "" {
			return
		}

		isStruct := strings.Contains(text, "{")
		isTuple := !isStruct && strings.Contains(text, "(")
		variants = append(variants, &Variant{
			Name:     name,
			Doc:      strings.TrimSpace(s.Find(".docblock").Text()),
			IsTuple:  isTuple,
			IsStruct: isStruct,
			IsUnit:   !isTuple && !isStruct,
		})
	})
	return variants
}

// extractKindFromClasses maps a search result's class attribute to a kind
// string. It returns the kind of the first "result-<kind>" class it
// recognises, mapping "method" to "fn" and "externcrate" to "mod". It returns
// "" when no class is recognised.
func extractKindFromClasses(classes string) string {
	for _, class := range strings.Fields(classes) {
		if !strings.HasPrefix(class, "result-") {
			continue
		}
		switch kind := strings.TrimPrefix(class, "result-"); kind {
		case "struct", "enum", "trait", "fn", "macro", "const", "static", "mod", "type", "primitive", "keyword", "attr":
			return kind
		case "method":
			return "fn"
		case "externcrate":
			return "mod"
		}
	}
	return ""
}

// resolveURL resolves href against base. An href starting with "http" is
// returned unchanged, and href is also returned unchanged when either value
// fails to parse as a URL.
func resolveURL(base string, href string) string {
	if strings.HasPrefix(href, "http") {
		return href
	}

	baseURL, err := url.Parse(base)
	if err != nil {
		return href
	}
	ref, err := url.Parse(href)
	if err != nil {
		return href
	}
	return baseURL.ResolveReference(ref).String()
}

// cleanText collapses every run of whitespace in text into a single space and
// trims leading and trailing whitespace.
func cleanText(text string) string {
	return strings.TrimSpace(whitespaceRunRe.ReplaceAllString(text, " "))
}