first test

This commit is contained in:
Tomas Dvorak
2026-02-08 14:14:55 +01:00
parent 18aa702174
commit d27cf14110
372 changed files with 98089 additions and 2585 deletions
+77
View File
@@ -0,0 +1,77 @@
# YouTube Scraper Service
A standalone microservice for scraping YouTube video data. This service runs independently from the main Trackeep application.
## Features
- **Mock YouTube Data**: Provides mock YouTube video data for development and testing
- **Channel Videos**: Fetch videos from specific YouTube channels
- **Search**: Search through YouTube video metadata
- **REST API**: Simple REST endpoints for integration
## API Endpoints
### Health Check
```
GET /
```
Returns a JSON object with the service status, name, version, and the list of available endpoints.
### Get Channel Videos
```
GET /channel_videos?channel={channel_name}
```
Fetches videos for a specific YouTube channel.
**Parameters:**
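- `channel` (required): YouTube channel handle, with or without the leading `@` (e.g., "@Fireship", "NetworkChuck"), or a full channel URL (e.g., "https://www.youtube.com/@Fireship")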
### Search Videos
```
GET /search?q={query}
```
Searches through video titles, descriptions, and channel names.
**Parameters:**
- `q`: Search query
## Running the Service
### Development
```bash
cd youtube-scraper
go run .
```
### Production
```bash
cd youtube-scraper
go build -o youtube-scraper .
./youtube-scraper
```
### Docker
```bash
docker build -f ../Dockerfile.youtube-scraper -t youtube-scraper ..
docker run -p 7857:7857 youtube-scraper
```
## Environment Variables
- `PORT`: Service port (default: 7857)
## Mock Data
The service includes mock data for popular tech YouTube channels:
- @Fireship
- @NetworkChuck
- @beyondfireship
- @LinusTechTips
- @Mrwhosetheboss
- @JerryRigEverything
- @JeffGeerling
- @mkbhd
## Integration
This service is designed to be called by the main Trackeep application via HTTP requests. The main app can be configured to use this service for YouTube-related features.
+32
View File
@@ -0,0 +1,32 @@
module youtube-scraper
go 1.21
require github.com/gin-gonic/gin v1.9.1
require (
github.com/bytedance/sonic v1.9.1 // indirect
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-playground/locales v0.14.1 // indirect
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.14.0 // indirect
github.com/goccy/go-json v0.10.2 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.4 // indirect
github.com/leodido/go-urn v1.2.4 // indirect
github.com/mattn/go-isatty v0.0.19 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.11 // indirect
golang.org/x/arch v0.3.0 // indirect
golang.org/x/crypto v0.9.0 // indirect
golang.org/x/net v0.10.0 // indirect
golang.org/x/sys v0.8.0 // indirect
golang.org/x/text v0.9.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
+86
View File
@@ -0,0 +1,86 @@
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
golang.org/x/crypto v0.9.0 h1:LF6fAI+IutBocDJ2OT0Q1g8plpYljMZ4+lty+dsqw3g=
golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0=
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
+539
View File
@@ -0,0 +1,539 @@
package main
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"regexp"
"strconv"
"strings"
"time"
)
// VideoResponse pairs a video ID with a channel name for JSON output.
type VideoResponse struct {
	// VideoID is serialized under the "video_id" key.
	VideoID string `json:"video_id"`

	// ChannelName is serialized under the "channel_name" key.
	ChannelName string `json:"channel_name"`
}
// ctx is a package-level background context: it is never cancelled and
// carries no deadline or values.
// NOTE(review): nothing in the visible part of this file reads it — confirm
// whether it is still needed.
var ctx context.Context = context.Background()
// ChannelVideosResponse is the JSON payload returned for a channel scrape.
type ChannelVideosResponse struct {
	// Channel is the channel handle shown to the caller.
	Channel string `json:"channel"`

	// ChannelURL is the page URL associated with the channel.
	ChannelURL string `json:"channel_url"`

	// SubscribersText is the subscriber count as raw text.
	SubscribersText string `json:"subscribers_text"`

	// Subscribers is the numeric subscriber count.
	Subscribers int64 `json:"subscribers"`

	// Videos lists the per-video metadata entries.
	Videos []VideoItem `json:"videos"`
}
// VideoItem is the metadata for one video found on a channel's /videos page.
// Every text field except VideoID is omitted from JSON when empty; Views is
// always emitted.
type VideoItem struct {
	// VideoID is the video identifier.
	VideoID string `json:"video_id"`

	// Title is the video title.
	Title string `json:"title,omitempty"`

	// Length is the duration as display text.
	Length string `json:"length,omitempty"`

	// ThumbnailURL points at a thumbnail image for the video.
	ThumbnailURL string `json:"thumbnail_url,omitempty"`

	// ViewsText is the view count as display text.
	ViewsText string `json:"views_text,omitempty"`

	// Views is the numeric view count.
	Views int64 `json:"views"`

	// PublishedText is the publication time as display text.
	PublishedText string `json:"published_text,omitempty"`

	// PublishedDate is the publication date as an ISO 8601 date.
	PublishedDate string `json:"published_date,omitempty"`
}
// channelSegmentRe captures the first path segment of a youtube.com URL (with
// or without "www."). The segment stops at '/', '?' or '#' so that query
// strings and fragments never leak into the handle. Compiled once at package
// scope instead of on every call.
var channelSegmentRe = regexp.MustCompile(`(?i)^https?://(?:www\.)?youtube\.com/([^/?#]+)`)

// ensureAtPrefix returns s with a leading "@", adding one only when missing.
func ensureAtPrefix(s string) string {
	if strings.HasPrefix(s, "@") {
		return s
	}
	return "@" + s
}

// normalizeChannelInput accepts a handle like "@FCBizoniUH" or "FCBizoniUH" or
// a full URL and returns the canonical handle (with leading @) and the URL of
// the page to scrape.
//
// Rules:
//   - Anything that does not look like a URL is treated as a handle and mapped
//     to https://www.youtube.com/<handle>/videos.
//   - URL-like input ("http://", "https://", "www." or "youtube.com/" prefix,
//     matched case-insensitively) gets an https:// scheme when it has none, and
//     m.youtube.com is rewritten to www.youtube.com.
//   - If the URL already names a tab (/videos, /shorts, /streams) it is
//     returned as given; otherwise a /videos URL is built from the handle.
//   - If no handle can be extracted from a URL, the URL itself is returned and
//     the handle falls back to "@" + that URL.
//
// The returned URL is NOT guaranteed to point at YouTube: a URL on another
// host is passed through unchanged, so callers that fetch it must validate the
// host themselves.
//
// NOTE(review): for paths such as /channel/<id>, /c/<name> or /user/<name> the
// first segment ("channel", "c", "user") is taken as the handle, exactly as
// before this change — confirm whether those URL forms need dedicated support.
func normalizeChannelInput(input string) (handle string, url string) {
	in := strings.TrimSpace(input)
	lower := strings.ToLower(in)

	hasScheme := strings.HasPrefix(lower, "http://") || strings.HasPrefix(lower, "https://")
	isURL := hasScheme || strings.HasPrefix(lower, "www.") || strings.HasPrefix(lower, "youtube.com/")

	if !isURL {
		// Not a URL; treat as handle or bare identifier.
		handle = ensureAtPrefix(in)
		return handle, fmt.Sprintf("https://www.youtube.com/%s/videos", handle)
	}

	// Ensure scheme. Only the scheme is added; the host is left as typed so
	// that differently-cased prefixes ("WWW.") are handled the same way.
	if !hasScheme {
		in = "https://" + in
	}
	// Normalize m.youtube.com -> www.youtube.com.
	in = strings.ReplaceAll(in, "://m.youtube.com", "://www.youtube.com")

	// Extract the handle from the first path segment, if there is one.
	if m := channelSegmentRe.FindStringSubmatch(in); m != nil {
		handle = ensureAtPrefix(m[1])
	}

	// Respect provided tab if present: /videos, /shorts, /streams; default to
	// /videos built from the detected handle.
	lowerIn := strings.ToLower(in)
	hasTab := strings.Contains(lowerIn, "/videos") ||
		strings.Contains(lowerIn, "/shorts") ||
		strings.Contains(lowerIn, "/streams")
	if hasTab || handle == "" {
		url = in
	} else {
		url = fmt.Sprintf("https://www.youtube.com/%s/videos", handle)
	}

	if handle == "" {
		// As a final fallback, derive something displayable from the input.
		handle = ensureAtPrefix(in)
	}
	return handle, url
}
// channelPageClient is shared by all channel fetches. Unlike
// http.DefaultClient it has an overall timeout, so a stalled upstream
// connection cannot hang a request forever.
var channelPageClient = &http.Client{Timeout: 20 * time.Second}

// Patterns used to pick fields out of the channel page. They are compiled once
// at package scope rather than once per video.
var (
	// ytChannelHostRe accepts only http(s) URLs whose host is exactly
	// youtube.com, www.youtube.com or m.youtube.com. The host must be followed
	// by '/', '?', '#' or end of string, which rejects look-alikes such as
	// "youtube.com.evil.test" and "youtube.com@evil.test".
	ytChannelHostRe = regexp.MustCompile(`(?i)^https?://(?:www\.|m\.)?youtube\.com(?:[/?#]|$)`)

	ytVideoRendererRe = regexp.MustCompile(`"videoRenderer":\{[^}]*?"videoId":"([a-zA-Z0-9_-]{11})"`)

	// Title patterns accept backslash-escaped characters so that a title
	// containing \" is not cut off at the escaped quote.
	ytTitleRunsRe   = regexp.MustCompile(`"title":\{"runs":\[\{"text":"((?:[^"\\]|\\.)+)"`)
	ytTitleSimpleRe = regexp.MustCompile(`"title":\{"simpleText":"((?:[^"\\]|\\.)+)"`)

	// ytLengthSimpleRe optionally skips a leading accessibility block, which
	// the plain [^}]* form cannot cross because that block contains '}'.
	ytLengthSimpleRe  = regexp.MustCompile(`"lengthText":\{(?:"accessibility":\{"accessibilityData":\{"label":"[^"]*"\}\},)?[^}]*"simpleText":"([^"]+)"`)
	ytLengthRunsRe    = regexp.MustCompile(`"lengthText":\{[^}]*"runs":\[\{"text":"([^"]+)"`)
	ytLengthOverlayRe = regexp.MustCompile(`"thumbnailOverlays":\[[^\]]*?"thumbnailOverlayTimeStatusRenderer":\{"text":\{"simpleText":"([^"]+)"`)
	ytBadgeTextRe     = regexp.MustCompile(`yt-badge-shape__text">([^<]+)<`)
	ytDOMLengthRe     = regexp.MustCompile(`yt-formatted-string[^>]*id="length"[^>]*>([0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?)<`)
	ytDOMLengthAriaRe = regexp.MustCompile(`yt-formatted-string[^>]*id="length"[^>]*aria-label="([^"]+)"`)

	ytPublishedRe   = regexp.MustCompile(`"publishedTimeText":\{"simpleText":"([^"]+)"`)
	ytViewsSimpleRe = regexp.MustCompile(`"viewCountText":\{"simpleText":"([^"]+)"`)
	ytViewsRunsRe   = regexp.MustCompile(`"viewCountText":\{"runs":\[\{"text":"([^"]+)"`)

	// ytCanonicalRe matches the base URL with either a plain or a
	// backslash-escaped leading slash.
	ytCanonicalRe = regexp.MustCompile(`"canonicalBaseUrl":"\\?/(@[^"]+)"`)

	ytSubsSimpleRe    = regexp.MustCompile(`"subscriberCountText":\{"simpleText":"([^"]+)"`)
	ytRunTextRe       = regexp.MustCompile(`"text":"([^"]+)"`)
	ytSubsApproxRe    = regexp.MustCompile(`"approximateSubscriberCount":"([^"]+)"`)
	ytSubsLocalizedRe = regexp.MustCompile(`(?i)([0-9][0-9\s\.,]*)\s*(odběratel(?:é|ů)?|subscribers?)`)
)

// fetchChannelVideos downloads a channel's page and extracts the videos and
// the subscriber count found in it.
//
// channelInput may be a handle (with or without "@") or a channel URL; it is
// normalized with normalizeChannelInput. Because the value comes from an HTTP
// query parameter, the resulting URL is checked against a YouTube host
// allow-list before any request is made, so the service cannot be used to
// fetch arbitrary hosts.
//
// It returns an error when the URL is not a YouTube URL, when the request
// fails or times out, when the response status is not 200, or when the body
// cannot be read. On success Videos is always a non-nil slice, so it encodes
// as a JSON array even when no video was found.
func fetchChannelVideos(channelInput string) (ChannelVideosResponse, error) {
	handle, channelURL := normalizeChannelInput(channelInput)
	if !ytChannelHostRe.MatchString(channelURL) {
		return ChannelVideosResponse{}, fmt.Errorf("refusing to fetch non-YouTube URL %q", channelURL)
	}
	log.Printf("Fetching channel videos: handle=%q url=%q", handle, channelURL)

	html, err := fetchChannelPage(channelURL)
	if err != nil {
		return ChannelVideosResponse{}, err
	}

	// Prefer the handle the page itself reports; fall back to the input.
	channelDisplay := handle
	if m := ytCanonicalRe.FindStringSubmatch(html); len(m) >= 2 {
		channelDisplay = m[1]
	}

	subText := extractSubscriberText(html)
	return ChannelVideosResponse{
		Channel:         channelDisplay,
		ChannelURL:      channelURL,
		SubscribersText: subText,
		Subscribers:     parseCountText(subText),
		Videos:          extractChannelVideoItems(html),
	}, nil
}

// fetchChannelPage performs the GET request and returns the response body.
// A desktop User-Agent and an English Accept-Language are sent to improve the
// likelihood of getting the full HTML payload with English labels.
func fetchChannelPage(pageURL string) (string, error) {
	req, err := http.NewRequest(http.MethodGet, pageURL, nil)
	if err != nil {
		return "", fmt.Errorf("building request for %q: %w", pageURL, err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
	req.Header.Set("Accept-Language", "en-US,en;q=0.9")

	resp, err := channelPageClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("requesting %q: %w", pageURL, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading response from %q: %w", pageURL, err)
	}
	return string(body), nil
}

// firstSubmatch returns capture group 1 of the first pattern that matches s,
// or "" when none of them match.
func firstSubmatch(s string, patterns ...*regexp.Regexp) string {
	for _, re := range patterns {
		if m := re.FindStringSubmatch(s); len(m) >= 2 {
			return m[1]
		}
	}
	return ""
}

// extractChannelVideoItems finds every distinct videoRenderer entry in html
// and parses its metadata. The result is never nil.
func extractChannelVideoItems(html string) []VideoItem {
	matches := ytVideoRendererRe.FindAllStringSubmatchIndex(html, -1)
	videos := make([]VideoItem, 0, len(matches))
	seen := make(map[string]struct{}, len(matches))

	for _, idx := range matches {
		if len(idx) < 4 { // need match start/end and group start/end
			continue
		}
		id := html[idx[2]:idx[3]]
		if _, dup := seen[id]; dup {
			continue
		}
		seen[id] = struct{}{}

		// The window starts at the renderer itself. Starting earlier would let
		// the first-match lookups below pick up the previous video's fields.
		end := idx[1] + 8000
		if end > len(html) {
			end = len(html)
		}
		snippet := html[idx[0]:end]

		vi := VideoItem{
			VideoID: id,
			// The thumbnail URL is derived from the video ID, so no page
			// lookup is needed for it.
			ThumbnailURL: fmt.Sprintf("https://img.youtube.com/vi/%s/maxresdefault.jpg", id),
			Length:       extractVideoLength(html, snippet, id),
		}

		// Title (may appear as runs or simpleText).
		if title := firstSubmatch(snippet, ytTitleRunsRe, ytTitleSimpleRe); title != "" {
			vi.Title = unescapeYT(title)
		}

		// Published time text (e.g., "3 days ago").
		if published := firstSubmatch(snippet, ytPublishedRe); published != "" {
			vi.PublishedText = published
			vi.PublishedDate = parseRelativeToISO(published)
		}

		// Views: the runs form carries only the number, so the label is added.
		if m := ytViewsSimpleRe.FindStringSubmatch(snippet); len(m) >= 2 {
			vi.ViewsText = m[1]
			vi.Views = parseCountText(m[1])
		} else if m := ytViewsRunsRe.FindStringSubmatch(snippet); len(m) >= 2 {
			vi.ViewsText = m[1] + " views"
			vi.Views = parseCountText(m[1])
		}

		videos = append(videos, vi)
	}
	return videos
}

// extractVideoLength returns the duration text for video id, or "" when none
// is found. It tries the JSON fields in snippet first, then the raw badge
// markup, and finally the DOM markup that follows the video's anchor in html.
func extractVideoLength(html, snippet, id string) string {
	if length := firstSubmatch(snippet, ytLengthSimpleRe, ytLengthRunsRe, ytLengthOverlayRe); length != "" {
		return length
	}
	if badge := strings.TrimSpace(firstSubmatch(snippet, ytBadgeTextRe)); badge != "" {
		return badge
	}

	// Extra fallback: search the page near the video anchor for DOM-based
	// duration. The pattern depends on the ID, so it is compiled here; this
	// path only runs when every cheaper lookup has failed.
	anchorRe := regexp.MustCompile(fmt.Sprintf(`<a[^>]+href="/watch\?v=%s[^\"]*"`, regexp.QuoteMeta(id)))
	loc := anchorRe.FindStringIndex(html)
	if loc == nil {
		return ""
	}
	end := loc[1] + 4000
	if end > len(html) {
		end = len(html)
	}
	chunk := html[loc[1]:end]

	if m := ytDOMLengthRe.FindStringSubmatch(chunk); len(m) >= 2 {
		// Inner text such as 5:59.
		return strings.TrimSpace(m[1])
	} else if m := ytDOMLengthAriaRe.FindStringSubmatch(chunk); len(m) >= 2 {
		// Localized aria-label such as "5 minutes, 59 seconds".
		return parseLocalizedDuration(unescapeYT(m[1]))
	} else if m := ytBadgeTextRe.FindStringSubmatch(chunk); len(m) >= 2 {
		return strings.TrimSpace(m[1])
	}
	return ""
}

// extractSubscriberText returns the subscriber count as display text, or ""
// when the page does not expose it in any of the known forms.
func extractSubscriberText(html string) string {
	// Try simpleText first.
	if m := ytSubsSimpleRe.FindStringSubmatch(html); len(m) >= 2 {
		return m[1]
	}

	// Try runs: join all text segments inside subscriberCountText.runs.
	const runsStart = `"subscriberCountText":{"runs":[`
	if i := strings.Index(html, runsStart); i != -1 {
		rest := html[i+len(runsStart):]
		if end := strings.Index(rest, "]}"); end != -1 {
			var parts []string
			for _, t := range ytRunTextRe.FindAllStringSubmatch(rest[:end], -1) {
				if len(t) >= 2 {
					parts = append(parts, unescapeYT(t[1]))
				}
			}
			if joined := strings.Join(parts, ""); joined != "" {
				return joined
			}
		}
	}

	// Fallbacks: approximateSubscriberCount, then localized patterns like
	// "131 odběratelů" anywhere in the page.
	if m := ytSubsApproxRe.FindStringSubmatch(html); len(m) >= 2 {
		return m[1]
	}
	if m := ytSubsLocalizedRe.FindStringSubmatch(html); len(m) >= 2 {
		return strings.TrimSpace(m[0])
	}
	return ""
}
// unescapeYT decodes the escape sequences found in strings lifted out of the
// JSON embedded in YouTube pages.
//
// The input is the raw text between the quotes of a JSON string. It is decoded
// as a JSON string so that every standard escape (\", \\, \/, \n, \uXXXX
// including surrogate pairs) is handled, not just a fixed few. Input without
// any backslash is returned unchanged. If the text is not a valid JSON string
// body, the function falls back to replacing only `\/` and `\u0026`, which is
// what it did before.
func unescapeYT(s string) string {
	if !strings.Contains(s, `\`) {
		return s // nothing to decode; also keeps non-JSON text untouched
	}

	var decoded string
	if err := json.Unmarshal([]byte(`"`+s+`"`), &decoded); err == nil {
		return decoded
	}

	// Best-effort fallback for malformed input.
	s = strings.ReplaceAll(s, `\/`, `/`)
	s = strings.ReplaceAll(s, `\u0026`, `&`)
	return s
}
// normalizeThumbURL unescapes a thumbnail URL and, when it is
// protocol-relative (starts with "//"), prefixes it with "https:". Any other
// URL is returned as unescaped, without further changes.
func normalizeThumbURL(u string) string {
	cleaned := unescapeYT(u)
	if !strings.HasPrefix(cleaned, "//") {
		return cleaned
	}
	return "https:" + cleaned
}
// relativeAgoRe matches "<n> <unit>[s] ago" anywhere in a lowercased string.
// Because it is unanchored it also covers prefixed forms such as
// "streamed 3 days ago" or "premiered 2 weeks ago", so no separate pattern is
// needed for those.
var relativeAgoRe = regexp.MustCompile(`(\d+)[\s-]*(second|minute|hour|day|week|month|year)s?\s+ago`)

// parseRelativeToISO converts strings like "3 days ago", "2 weeks ago",
// "1 year ago" or "Streamed 5 hours ago" to an ISO date (yyyy-mm-dd) computed
// relative to the current local time.
//
// It returns "" when rel contains no recognizable relative time, or when the
// number is too large to be parsed as an int.
func parseRelativeToISO(rel string) string {
	m := relativeAgoRe.FindStringSubmatch(strings.ToLower(rel))
	if len(m) < 3 {
		return ""
	}
	n, err := strconv.Atoi(m[1])
	if err != nil {
		// Only reachable on overflow; a date derived from a clamped value
		// would be meaningless.
		return ""
	}

	now := time.Now()
	var then time.Time
	switch m[2] {
	case "second":
		then = now.Add(-time.Duration(n) * time.Second)
	case "minute":
		then = now.Add(-time.Duration(n) * time.Minute)
	case "hour":
		then = now.Add(-time.Duration(n) * time.Hour)
	case "day":
		then = now.AddDate(0, 0, -n)
	case "week":
		then = now.AddDate(0, 0, -7*n)
	case "month":
		then = now.AddDate(0, -n, 0)
	case "year":
		then = now.AddDate(-n, 0, 0)
	default:
		return "" // unreachable: the pattern only admits the units above
	}
	return then.Format("2006-01-02")
}
// Patterns for parseLocalizedDuration. The English and Czech variants are kept
// separate because the English value takes precedence when it is non-zero.
var (
	durClockRe     = regexp.MustCompile(`^\d{1,2}:\d{2}(?::\d{2})?$`)
	durHourENRe    = regexp.MustCompile(`(\d+)\s*hour`)
	durMinuteENRe  = regexp.MustCompile(`(\d+)\s*minute`)
	durSecondENRe  = regexp.MustCompile(`(\d+)\s*second`)
	durHourCSRe    = regexp.MustCompile(`(\d+)\s*hodin(?:a|y)?`)
	durMinuteCSRe  = regexp.MustCompile(`(\d+)\s*minut(?:a|y)?`)
	durSecondCSRe  = regexp.MustCompile(`(\d+)\s*sekund(?:a|y)?`)
	durBareCountRe = regexp.MustCompile(`^(\d+)$`)
)

// durationCount returns the number captured by the first match of re in s,
// or 0 when re does not match.
func durationCount(re *regexp.Regexp, s string) int {
	found := re.FindStringSubmatch(s)
	if len(found) < 2 {
		return 0
	}
	n, _ := strconv.Atoi(found[1])
	return n
}

// parseLocalizedDuration turns a spoken-style duration such as
// "1 hour 2 minutes 3 seconds" or the Czech "5 minut a 59 sekund" into clock
// notation: "h:mm:ss" when hours are present, otherwise "m:ss".
//
// Input that is already in clock notation is returned as-is (lowercased and
// trimmed). A bare number is read as minutes. "" is returned when nothing
// could be parsed.
func parseLocalizedDuration(s string) string {
	text := strings.ToLower(strings.TrimSpace(s))
	// Both the HTML entity and the raw non-breaking space become plain spaces.
	text = strings.NewReplacer("&nbsp;", " ", "\u00a0", " ").Replace(text)
	text = strings.TrimSpace(text)

	if clock := durClockRe.FindString(text); clock != "" {
		return clock
	}

	// English first; the Czech pattern only fills a component that is still 0.
	hours := durationCount(durHourENRe, text)
	if hours == 0 {
		hours = durationCount(durHourCSRe, text)
	}
	minutes := durationCount(durMinuteENRe, text)
	if minutes == 0 {
		minutes = durationCount(durMinuteCSRe, text)
	}
	seconds := durationCount(durSecondENRe, text)
	if seconds == 0 {
		seconds = durationCount(durSecondCSRe, text)
	}

	// Nothing recognized: a string that is just a number counts as minutes.
	if hours == 0 && minutes == 0 && seconds == 0 {
		minutes = durationCount(durBareCountRe, text)
	}

	switch {
	case hours > 0:
		return fmt.Sprintf("%d:%02d:%02d", hours, minutes, seconds)
	case minutes > 0 || seconds > 0:
		return fmt.Sprintf("%d:%02d", minutes, seconds)
	default:
		return ""
	}
}
// countTokenRe matches the first number in a lowercased string: digits with
// optional comma-separated thousands groups ("1,234,567"), an optional decimal
// part, and an optional k/m/b magnitude suffix directly after the number.
var countTokenRe = regexp.MustCompile(`([0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?)([kmb])?`)

// parseCountText converts display counts such as "1,234 views",
// "12K subscribers" or "3.4M" into a number.
//
// Only the first number in s is considered. Commas are treated as thousands
// separators, and a k, m or b suffix (case-insensitive) multiplies by 1e3,
// 1e6 or 1e9. The result is rounded to the nearest integer so that values
// like 2.3M do not lose a unit to floating-point truncation. It returns 0
// when s contains no digits or the number cannot be parsed.
func parseCountText(s string) int64 {
	m := countTokenRe.FindStringSubmatch(strings.ToLower(strings.TrimSpace(s)))
	if len(m) < 3 {
		return 0
	}

	f, err := strconv.ParseFloat(strings.ReplaceAll(m[1], ",", ""), 64)
	if err != nil {
		return 0
	}
	switch m[2] {
	case "k":
		f *= 1_000
	case "m":
		f *= 1_000_000
	case "b":
		f *= 1_000_000_000
	}

	// Converting an out-of-range float to int64 is not well defined, so clamp.
	const maxInt64 = int64(1<<63 - 1)
	if f >= float64(maxInt64) {
		return maxInt64
	}
	return int64(f + 0.5)
}
// channelVideosHandler serves GET /channel_videos?channel=<handle-or-url>.
//
// Responses:
//   - 400 when the channel parameter is missing or blank.
//   - 500 when the channel could not be fetched or the result could not be
//     encoded. The underlying error is logged, not sent to the client.
//   - 200 with a JSON-encoded ChannelVideosResponse otherwise.
func channelVideosHandler(w http.ResponseWriter, r *http.Request) {
	// Trim so that a whitespace-only value is rejected here instead of being
	// turned into a request for an empty handle.
	channel := strings.TrimSpace(r.URL.Query().Get("channel"))
	if channel == "" {
		log.Println("Missing channel parameter")
		http.Error(w, "Missing channel parameter. Provide a handle like @FCBizoniUH, FCBizoniUH, or a full channel URL.", http.StatusBadRequest)
		return
	}

	res, err := fetchChannelVideos(channel)
	if err != nil {
		log.Printf("Failed to fetch channel videos for %q: %v", channel, err)
		http.Error(w, "Failed to fetch channel videos", http.StatusInternalServerError)
		return
	}

	// Encode before touching the response so that an encoding failure can
	// still be reported with a proper status code.
	payload, err := json.Marshal(res)
	if err != nil {
		log.Printf("Failed to encode channel videos for %q: %v", channel, err)
		http.Error(w, "Failed to encode channel videos", http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	// The trailing newline keeps the body identical to json.Encoder output.
	if _, err := w.Write(append(payload, '\n')); err != nil {
		log.Printf("Failed to write channel videos response for %q: %v", channel, err)
	}
}
// corsMiddleware wraps next so that every response carries permissive CORS
// headers (any origin; GET, POST and OPTIONS; the Content-Type header).
// OPTIONS requests are treated as preflight: they are answered with 200 and
// never reach next.
func corsMiddleware(next http.Handler) http.Handler {
	withCORS := func(w http.ResponseWriter, r *http.Request) {
		headers := w.Header()
		headers.Set("Access-Control-Allow-Origin", "*")
		headers.Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
		headers.Set("Access-Control-Allow-Headers", "Content-Type")

		if r.Method != http.MethodOptions {
			next.ServeHTTP(w, r)
			return
		}
		// Preflight: the headers above are the whole answer.
		w.WriteHeader(http.StatusOK)
	}
	return http.HandlerFunc(withCORS)
}
// rootHandler answers the health-check route. Only the exact path "/" is
// served; because "/" is also the catch-all pattern of the mux, every other
// path gets a 404. The body is a JSON object with the service status, name,
// version and the available endpoints.
func rootHandler(w http.ResponseWriter, r *http.Request) {
	if r.URL.Path != "/" {
		http.NotFound(w, r)
		return
	}

	endpoints := map[string]string{
		"channel_videos": "/channel_videos?channel={handle_or_url}",
	}
	info := map[string]interface{}{
		"status":    "ok",
		"service":   "YouTube Scraper",
		"version":   "1.0.0",
		"endpoints": endpoints,
	}

	w.Header().Set("Content-Type", "application/json")
	enc := json.NewEncoder(w)
	_ = enc.Encode(info)
}
func main() {
port := os.Getenv("PORT")
if port == "" {
port = "7857"
}
mux := http.NewServeMux()
// Create a new mux with CORS middleware
handlerWithCORS := corsMiddleware(mux)
// Register routes on the original mux
mux.HandleFunc("/", rootHandler)
mux.HandleFunc("/channel_videos", channelVideosHandler)
log.Printf("YouTube Scraper starting on port %s", port)
log.Fatal(http.ListenAndServe(":"+port, handlerWithCORS))
}