Files
Trackeep/youtube-video-scraper/main.go
T
Tomas Dvorak d27cf14110 first test
2026-02-08 14:14:55 +01:00

229 lines
5.5 KiB
Go

package main
import (
	"fmt"
	"log"
	"net/http"
	"os"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"github.com/gin-gonic/gin"
)
// VideoInfo represents the extracted video information and is the JSON body
// returned for a scrape request. Success reports whether extraction worked;
// when it is false, Error carries the reason.
type VideoInfo struct {
// VideoID is the identifier extracted from the requested URL.
VideoID string `json:"video_id"`
// Title is the video title scraped from the page.
Title string `json:"title"`
// Channel is the channel name scraped from the page.
Channel string `json:"channel"`
// Thumbnail is the thumbnail image URL; serialized as "thumbnail_url".
Thumbnail string `json:"thumbnail_url"`
// Success is true only when the scrape produced a complete result.
Success bool `json:"success"`
// Error describes the failure; omitted from the JSON when empty.
Error string `json:"error,omitempty"`
}
// ExtractVideoID returns the video ID embedded in a YouTube URL, or "" when
// the URL matches none of the supported forms or carries no ID.
//
// Supported forms:
//
//	https://youtu.be/VIDEO_ID
//	https://www.youtube.com/watch?v=VIDEO_ID
//	https://www.youtube.com/embed/VIDEO_ID
//
// Any "#fragment" is ignored, the "v" parameter is located by parsing the
// query string (so it may appear at any position and parameters that merely
// end in "v" are not mistaken for it), and path-style IDs stop at the next
// "/", "?", "&" or "#".
func ExtractVideoID(url string) string {
	// The fragment is never part of the ID; drop it up front so that
	// "watch?v=ID#t=10" does not yield "ID#t=10".
	if i := strings.IndexByte(url, '#'); i >= 0 {
		url = url[:i]
	}

	switch {
	case strings.Contains(url, "youtu.be/"):
		return videoIDAfterMarker(url, "youtu.be/")
	case strings.Contains(url, "youtube.com/watch"):
		return videoIDFromQuery(url)
	case strings.Contains(url, "youtube.com/embed/"):
		return videoIDAfterMarker(url, "embed/")
	}
	return ""
}

// videoIDAfterMarker returns the path segment that directly follows the first
// occurrence of marker in s, or "" when marker is absent.
func videoIDAfterMarker(s, marker string) string {
	start := strings.Index(s, marker)
	if start < 0 {
		return ""
	}
	segment := s[start+len(marker):]
	if end := strings.IndexAny(segment, "/?&#"); end >= 0 {
		segment = segment[:end]
	}
	return segment
}

// videoIDFromQuery returns the value of the "v" query parameter of s, or ""
// when s has no query string or no such parameter.
func videoIDFromQuery(s string) string {
	q := strings.IndexByte(s, '?')
	if q < 0 {
		return ""
	}
	for _, pair := range strings.Split(s[q+1:], "&") {
		// Match the whole key, not a substring: "rev=1" must not count as "v=1".
		if strings.HasPrefix(pair, "v=") {
			return pair[len("v="):]
		}
	}
	return ""
}
// ScrapeVideoInfo downloads the YouTube watch page for the video referenced
// by url and extracts the video's title and channel name from its HTML.
//
// The video ID is taken from url via ExtractVideoID, but the page that is
// actually requested is always https://www.youtube.com/watch?v=<id>, rebuilt
// from that ID. The caller-supplied URL is never fetched directly: it only
// has to contain a YouTube-looking substring to yield an ID, so fetching it
// verbatim would let a client make this server issue requests to arbitrary
// hosts (server-side request forgery).
//
// The returned VideoInfo always has Success set. On failure Error describes
// what went wrong and the remaining fields are left empty.
func ScrapeVideoInfo(url string) VideoInfo {
	videoID := ExtractVideoID(url)
	fmt.Printf("Extracted video ID: '%s' from URL: %s\n", videoID, url)
	if !isSafeVideoID(videoID) {
		return VideoInfo{
			Success: false,
			Error:   "Invalid YouTube URL",
		}
	}

	// Only the validated ID is interpolated, so the host cannot be influenced.
	pageURL := "https://www.youtube.com/watch?v=" + videoID

	// A timeout keeps a stalled upstream from pinning the handler forever.
	client := &http.Client{Timeout: 15 * time.Second}
	req, err := http.NewRequest(http.MethodGet, pageURL, nil)
	if err != nil {
		return VideoInfo{
			Success: false,
			Error:   fmt.Sprintf("Failed to create request: %v", err),
		}
	}
	// Set user agent to mimic a browser.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

	resp, err := client.Do(req)
	if err != nil {
		return VideoInfo{
			Success: false,
			Error:   fmt.Sprintf("Failed to fetch page: %v", err),
		}
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return VideoInfo{
			Success: false,
			Error:   fmt.Sprintf("HTTP %d: %s", resp.StatusCode, resp.Status),
		}
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return VideoInfo{
			Success: false,
			Error:   fmt.Sprintf("Failed to parse HTML: %v", err),
		}
	}

	title := scrapeTitle(doc)
	channel := scrapeChannel(doc)
	if title == "" || channel == "" {
		return VideoInfo{
			Success: false,
			Error:   "Could not extract title or channel",
		}
	}

	return VideoInfo{
		VideoID:   videoID,
		Title:     title,
		Channel:   channel,
		Thumbnail: fmt.Sprintf("https://i.ytimg.com/vi/%s/hqdefault.jpg", videoID),
		Success:   true,
	}
}

// isSafeVideoID reports whether id is non-empty and consists solely of ASCII
// letters, digits, '-' and '_', so it can be placed in a URL unescaped.
func isSafeVideoID(id string) bool {
	if id == "" {
		return false
	}
	for i := 0; i < len(id); i++ {
		c := id[i]
		switch {
		case c >= 'a' && c <= 'z',
			c >= 'A' && c <= 'Z',
			c >= '0' && c <= '9',
			c == '-', c == '_':
			// allowed
		default:
			return false
		}
	}
	return true
}

// scrapeTitle returns the video title found in doc, trying progressively
// broader selectors, or "" when none of them yields one.
func scrapeTitle(doc *goquery.Document) string {
	title := ""
	doc.Find("h1.ytd-watch-metadata yt-formatted-string").Each(func(_ int, s *goquery.Selection) {
		title = strings.TrimSpace(s.Text())
	})
	if title != "" {
		return title
	}

	// Fallback: any non-empty <h1> that is not YouTube's own branding.
	doc.Find("h1").Each(func(_ int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" && !strings.Contains(text, "YouTube") {
			title = text
		}
	})
	if title != "" {
		return title
	}

	// Last resort: the <title> tag, minus the " - YouTube" suffix.
	doc.Find("title").Each(func(_ int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if strings.Contains(text, " - YouTube") {
			title = strings.Replace(text, " - YouTube", "", 1)
		} else if !strings.Contains(text, "YouTube") {
			title = text
		}
	})
	return title
}

// scrapeChannel returns the channel name found in doc, trying progressively
// broader selectors, or "" when none of them yields one.
func scrapeChannel(doc *goquery.Document) string {
	channel := ""
	doc.Find("ytd-video-owner-renderer yt-formatted-string a").Each(func(_ int, s *goquery.Selection) {
		channel = strings.TrimSpace(s.Text())
	})
	if channel != "" {
		return channel
	}

	doc.Find("ytd-channel-name yt-formatted-string a").Each(func(_ int, s *goquery.Selection) {
		channel = strings.TrimSpace(s.Text())
	})
	if channel != "" {
		return channel
	}

	// Last resort: a link to an "/@handle" page whose text has no spaces.
	doc.Find("a[href*='/@']").Each(func(_ int, s *goquery.Selection) {
		text := strings.TrimSpace(s.Text())
		if text != "" && !strings.Contains(text, " ") {
			channel = text
		}
	})
	return channel
}
// VideoHandler serves POST requests carrying a JSON body of the form
// {"url": "..."}. A body that cannot be bound (malformed JSON or a missing
// url) is answered with 400 and an error object; otherwise the URL is passed
// to ScrapeVideoInfo and its result is returned with status 200, whether or
// not the scrape itself succeeded.
func VideoHandler(c *gin.Context) {
	var payload struct {
		URL string `json:"url" binding:"required"`
	}

	bindErr := c.ShouldBindJSON(&payload)
	if bindErr != nil {
		fmt.Printf("Error binding request: %v\n", bindErr)
		rejection := gin.H{
			"success": false,
			"error":   "Invalid request format",
		}
		c.JSON(http.StatusBadRequest, rejection)
		return
	}

	target := payload.URL
	fmt.Printf("Received request for URL: %s\n", target)
	c.JSON(http.StatusOK, ScrapeVideoInfo(target))
}
// main configures the gin router and serves until the listener fails.
//
// The listen port comes from the PORT environment variable and defaults to
// 7858. Routes:
//
//	POST /video   scrape the video named in the request body
//	GET  /health  liveness probe
//	GET  /test    run ExtractVideoID on a fixed sample URL
func main() {
	port := os.Getenv("PORT")
	if port == "" {
		port = "7858"
	}

	router := gin.Default()

	// CORS is open to every origin; preflight requests are answered here
	// with 204 and never reach a route handler.
	allowAllOrigins := func(c *gin.Context) {
		c.Header("Access-Control-Allow-Origin", "*")
		c.Header("Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS")
		c.Header("Access-Control-Allow-Headers", "Content-Type, Authorization")

		if c.Request.Method != "OPTIONS" {
			c.Next()
			return
		}
		c.AbortWithStatus(204)
	}
	// Must be installed before the routes so that it applies to them.
	router.Use(allowAllOrigins)

	healthHandler := func(c *gin.Context) {
		c.JSON(http.StatusOK, gin.H{"status": "healthy"})
	}
	selfTestHandler := func(c *gin.Context) {
		const sampleURL = "https://www.youtube.com/watch?v=WKXh4Z6SYMs"
		c.JSON(http.StatusOK, gin.H{
			"url":      sampleURL,
			"video_id": ExtractVideoID(sampleURL),
		})
	}

	router.POST("/video", VideoHandler)
	router.GET("/health", healthHandler)
	router.GET("/test", selfTestHandler)

	log.Printf("YouTube video scraper starting on port %s", port)
	log.Fatal(router.Run(":" + port))
}