package services

import (
	"crypto/md5"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"time"
)

// maxMetadataBodyBytes caps how much of a response body FetchWebsiteMetadata
// reads, so an oversized or endless response cannot exhaust memory.
const maxMetadataBodyBytes = 2 << 20 // 2 MiB

// metadataHTTPClient is shared by every FetchWebsiteMetadata call so that
// connections can be reused. The timeout covers the whole exchange,
// including redirects and reading the response body.
var metadataHTTPClient = &http.Client{Timeout: 10 * time.Second}

// WebsiteMetadata represents extracted website information. Every field is
// stored exactly as it appears in the page markup; nothing is parsed or
// unescaped.
type WebsiteMetadata struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Favicon     string `json:"favicon"`
	SiteName    string `json:"site_name"`
	Image       string `json:"image"`
	Author      string `json:"author"`
	PublishedAt string `json:"published_at"`
}

// metaTagRule pairs a compiled <meta> tag pattern with an accessor for the
// WebsiteMetadata field that receives the tag's content attribute.
type metaTagRule struct {
	pattern *regexp.Regexp
	field   func(*WebsiteMetadata) *string
}

// newMetaTagRule compiles the pattern for a tag of the form
//
//	<meta ATTR="NAME" ... content="VALUE">
//
// where attr is the identifying attribute ("property" or "name") and name is
// its expected value. This is a regex approximation, not an HTML parser: the
// identifying attribute must appear before content, and matching is
// case-sensitive.
func newMetaTagRule(attr, name string, field func(*WebsiteMetadata) *string) metaTagRule {
	// The content value is captured per quote style so that an apostrophe
	// inside a double-quoted value (or vice versa) does not end the capture.
	expr := `<meta[^>]+` + attr + `=["']` + regexp.QuoteMeta(name) +
		`["'][^>]+content=(?:"([^"]+)"|'([^']+)')`
	return metaTagRule{pattern: regexp.MustCompile(expr), field: field}
}

// find returns the content value of the first tag in content that matches
// the rule, and whether such a tag exists.
func (r metaTagRule) find(content string) (string, bool) {
	m := r.pattern.FindStringSubmatch(content)
	if m == nil {
		return "", false
	}
	// Exactly one of the two capture groups is non-empty on a match.
	if m[1] != "" {
		return m[1], true
	}
	return m[2], true
}

// openGraphRules lists the Open Graph and article meta tags that are read,
// compiled once at package initialization.
var openGraphRules = []metaTagRule{
	newMetaTagRule("property", "og:title", func(m *WebsiteMetadata) *string { return &m.Title }),
	newMetaTagRule("property", "og:description", func(m *WebsiteMetadata) *string { return &m.Description }),
	newMetaTagRule("property", "og:image", func(m *WebsiteMetadata) *string { return &m.Image }),
	newMetaTagRule("property", "og:site_name", func(m *WebsiteMetadata) *string { return &m.SiteName }),
	newMetaTagRule("property", "article:author", func(m *WebsiteMetadata) *string { return &m.Author }),
	newMetaTagRule("property", "article:published_time", func(m *WebsiteMetadata) *string { return &m.PublishedAt }),
}

// twitterCardRules lists the Twitter Card meta tags that are read, compiled
// once at package initialization.
var twitterCardRules = []metaTagRule{
	newMetaTagRule("name", "twitter:title", func(m *WebsiteMetadata) *string { return &m.Title }),
	newMetaTagRule("name", "twitter:description", func(m *WebsiteMetadata) *string { return &m.Description }),
	newMetaTagRule("name", "twitter:image", func(m *WebsiteMetadata) *string { return &m.Image }),
	newMetaTagRule("name", "twitter:site", func(m *WebsiteMetadata) *string { return &m.SiteName }),
	newMetaTagRule("name", "twitter:creator", func(m *WebsiteMetadata) *string { return &m.Author }),
}

// FetchWebsiteMetadata downloads targetURL with a GET request and extracts
// metadata from the returned markup.
//
// Open Graph tags are applied first, Twitter Card tags then fill any fields
// that are still empty, and extractBasicHTMLMetadata runs last. If Favicon
// is still empty afterwards, GetFavicon is consulted on a best-effort basis.
//
// An error is returned when targetURL is not an absolute http or https URL,
// when the request fails, when the response status is not 200 OK, or when
// the body cannot be read. At most maxMetadataBodyBytes of the body are
// inspected.
func FetchWebsiteMetadata(targetURL string) (*WebsiteMetadata, error) {
	// url.Parse accepts almost any string, so also require the parts the
	// HTTP client needs; this fails early instead of inside client.Do.
	parsed, err := url.Parse(targetURL)
	if err != nil {
		return nil, fmt.Errorf("invalid URL: %w", err)
	}
	if parsed.Scheme != "http" && parsed.Scheme != "https" {
		return nil, fmt.Errorf("invalid URL: unsupported scheme %q", parsed.Scheme)
	}
	if parsed.Host == "" {
		return nil, fmt.Errorf("invalid URL: missing host in %q", targetURL)
	}

	req, err := http.NewRequest(http.MethodGet, targetURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	// Set a browser-like user agent to avoid being blocked.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")

	resp, err := metadataHTTPClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
	}

	// Bound the read: the body comes from a remote server of unknown size.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxMetadataBodyBytes))
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	content := string(body)
	metadata := &WebsiteMetadata{}

	// Order matters: each later extractor only fills fields left empty.
	metadata = extractOpenGraphMetadata(content, metadata)
	metadata = extractTwitterMetadata(content, metadata)
	metadata = extractBasicHTMLMetadata(content, metadata)

	// Favicon lookup is best-effort; a failure here must not fail the fetch.
	if metadata.Favicon == "" {
		if favicon, err := GetFavicon(targetURL); err == nil && favicon != "" {
			metadata.Favicon = favicon
		}
	}

	return metadata, nil
}

// extractOpenGraphMetadata copies Open Graph and article meta tag values
// from content into metadata, overwriting any value already present in a
// matched field. Fields whose tag is absent are left untouched. It returns
// the same metadata pointer it was given.
func extractOpenGraphMetadata(content string, metadata *WebsiteMetadata) *WebsiteMetadata {
	for _, rule := range openGraphRules {
		if value, ok := rule.find(content); ok {
			*rule.field(metadata) = value
		}
	}
	return metadata
}

// extractTwitterMetadata copies Twitter Card meta tag values from content
// into metadata, but only into fields that are still empty, so values set
// earlier (for example by Open Graph tags) win. It returns the same
// metadata pointer it was given.
func extractTwitterMetadata(content string, metadata *WebsiteMetadata) *WebsiteMetadata {
	for _, rule := range twitterCardRules {
		value, ok := rule.find(content)
		if !ok {
			continue
		}
		// Only set if not already set by an earlier extractor.
		if dst := rule.field(metadata); *dst == "" {
			*dst = value
		}
	}
	return metadata
}

// extractBasicHTMLMetadata extracts basic HTML title and description
func extractBasicHTMLMetadata(content string, metadata *WebsiteMetadata) *WebsiteMetadata {
	// Extract title
	if metadata.Title == "" {
		if re := regexp.MustCompile(`