mirror of
https://github.com/Dvorinka/Trackeep.git
synced 2026-06-03 20:12:58 +00:00
first test
This commit is contained in:
@@ -0,0 +1,77 @@
|
||||
# YouTube Scraper Service
|
||||
|
||||
A standalone microservice for scraping YouTube video data. This service runs independently from the main Trackeep application.
|
||||
|
||||
## Features
|
||||
|
||||
- **Mock YouTube Data**: Provides mock YouTube video data for development and testing
|
||||
- **Channel Videos**: Fetch videos from specific YouTube channels
|
||||
- **Search**: Search through YouTube video metadata
|
||||
- **REST API**: Simple REST endpoints for integration
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Health Check
|
||||
```
|
||||
GET /
|
||||
```
|
||||
Returns service status and information.
|
||||
|
||||
### Get Channel Videos
|
||||
```
|
||||
GET /channel_videos?channel={channel_name}
|
||||
```
|
||||
Fetches videos for a specific YouTube channel.
|
||||
|
||||
**Parameters:**
|
||||
- `channel`: YouTube channel name (e.g., "@Fireship", "@NetworkChuck")
|
||||
|
||||
### Search Videos
|
||||
```
|
||||
GET /search?q={query}
|
||||
```
|
||||
Searches through video titles, descriptions, and channel names.
|
||||
|
||||
**Parameters:**
|
||||
- `q`: Search query
|
||||
|
||||
## Running the Service
|
||||
|
||||
### Development
|
||||
```bash
|
||||
cd youtube-scraper
|
||||
go run .
|
||||
```
|
||||
|
||||
### Production
|
||||
```bash
|
||||
cd youtube-scraper
|
||||
go build -o youtube-scraper .
|
||||
./youtube-scraper
|
||||
```
|
||||
|
||||
### Docker
|
||||
```bash
|
||||
docker build -f ../Dockerfile.youtube-scraper -t youtube-scraper ..
|
||||
docker run -p 7857:7857 youtube-scraper
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `PORT`: Service port (default: 7857)
|
||||
|
||||
## Mock Data
|
||||
|
||||
The service includes mock data for popular tech YouTube channels:
|
||||
- @Fireship
|
||||
- @NetworkChuck
|
||||
- @beyondfireship
|
||||
- @LinusTechTips
|
||||
- @Mrwhosetheboss
|
||||
- @JerryRigEverything
|
||||
- @JeffGeerling
|
||||
- @mkbhd
|
||||
|
||||
## Integration
|
||||
|
||||
This service is designed to be called by the main Trackeep application via HTTP requests. The main app can be configured to use this service for YouTube-related features.
|
||||
@@ -0,0 +1,32 @@
|
||||
module youtube-scraper
|
||||
|
||||
go 1.21
|
||||
|
||||
require github.com/gin-gonic/gin v1.9.1
|
||||
|
||||
require (
|
||||
github.com/bytedance/sonic v1.9.1 // indirect
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 // indirect
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 // indirect
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
github.com/go-playground/locales v0.14.1 // indirect
|
||||
github.com/go-playground/universal-translator v0.18.1 // indirect
|
||||
github.com/go-playground/validator/v10 v10.14.0 // indirect
|
||||
github.com/goccy/go-json v0.10.2 // indirect
|
||||
github.com/json-iterator/go v1.1.12 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.2.4 // indirect
|
||||
github.com/leodido/go-urn v1.2.4 // indirect
|
||||
github.com/mattn/go-isatty v0.0.19 // indirect
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
|
||||
github.com/modern-go/reflect2 v1.0.2 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.0.8 // indirect
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
|
||||
github.com/ugorji/go/codec v1.2.11 // indirect
|
||||
golang.org/x/arch v0.3.0 // indirect
|
||||
golang.org/x/crypto v0.9.0 // indirect
|
||||
golang.org/x/net v0.10.0 // indirect
|
||||
golang.org/x/sys v0.8.0 // indirect
|
||||
golang.org/x/text v0.9.0 // indirect
|
||||
google.golang.org/protobuf v1.30.0 // indirect
|
||||
gopkg.in/yaml.v3 v3.0.1 // indirect
|
||||
)
|
||||
@@ -0,0 +1,86 @@
|
||||
github.com/bytedance/sonic v1.5.0/go.mod h1:ED5hyg4y6t3/9Ku1R6dU/4KyJ48DZ4jPhfY1O2AihPM=
|
||||
github.com/bytedance/sonic v1.9.1 h1:6iJ6NqdoxCDr6mbY8h18oSO+cShGSMRGCEo7F2h0x8s=
|
||||
github.com/bytedance/sonic v1.9.1/go.mod h1:i736AoUSYt75HyZLoJW9ERYxcy6eaN6h4BZXU064P/U=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20211019084208-fb5309c8db06/go.mod h1:DH46F32mSOjUmXrMHnKwZdA8wcEefY7UVqBKYGjpdQY=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311 h1:qSGYFH7+jGhDF8vLC+iwCD4WpbV1EBDSzWkJODFLams=
|
||||
github.com/chenzhuoyu/base64x v0.0.0-20221115062448-fe3a3abad311/go.mod h1:b583jCggY9gE99b6G5LEC39OIiVsWj+R97kbl5odCEk=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
|
||||
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
|
||||
github.com/gin-contrib/sse v0.1.0 h1:Y/yl/+YNO8GZSjAhjMsSuLt29uWRFHdHYUb5lYOV9qE=
|
||||
github.com/gin-contrib/sse v0.1.0/go.mod h1:RHrZQHXnP2xjPF+u1gW/2HnVO7nvIa9PG3Gm+fLHvGI=
|
||||
github.com/gin-gonic/gin v1.9.1 h1:4idEAncQnU5cB7BeOkPtxjfCSye0AAm1R0RVIqJ+Jmg=
|
||||
github.com/gin-gonic/gin v1.9.1/go.mod h1:hPrL7YrpYKXt5YId3A/Tnip5kqbEAP+KLuI3SUcPTeU=
|
||||
github.com/go-playground/assert/v2 v2.2.0 h1:JvknZsQTYeFEAhQwI4qEt9cyV5ONwRHC+lYKSsYSR8s=
|
||||
github.com/go-playground/assert/v2 v2.2.0/go.mod h1:VDjEfimB/XKnb+ZQfWdccd7VUvScMdVu0Titje2rxJ4=
|
||||
github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/oXslEjJA=
|
||||
github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY=
|
||||
github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY=
|
||||
github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY=
|
||||
github.com/go-playground/validator/v10 v10.14.0 h1:vgvQWe3XCz3gIeFDm/HnTIbj6UGmg/+t63MyGU2n5js=
|
||||
github.com/go-playground/validator/v10 v10.14.0/go.mod h1:9iXMNT7sEkjXb0I+enO7QXmzG6QCsPWY4zveKFVRSyU=
|
||||
github.com/goccy/go-json v0.10.2 h1:CrxCmQqYDkv1z7lO7Wbh2HN93uovUHgrECaO5ZrCXAU=
|
||||
github.com/goccy/go-json v0.10.2/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/google/go-cmp v0.5.5 h1:Khx7svrCpmxxtHBq5j2mp/xVjsi8hQMfNLvJFAlrGgU=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
|
||||
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
|
||||
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
|
||||
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
|
||||
github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
|
||||
github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
|
||||
github.com/leodido/go-urn v1.2.4 h1:XlAE/cm/ms7TE/VMVoduSpNBoyc2dOxHs5MZSwAN63Q=
|
||||
github.com/leodido/go-urn v1.2.4/go.mod h1:7ZrI8mTSeBSHl/UaRyKQW1qZeMgak41ANeCNaVckg+4=
|
||||
github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA=
|
||||
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
|
||||
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
|
||||
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
|
||||
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
|
||||
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8 h1:0ctb6s9mE31h0/lhu+J6OPmVeDxJn+kYnJc2jZR9tGQ=
|
||||
github.com/pelletier/go-toml/v2 v2.0.8/go.mod h1:vuYfssBdrU2XDZ9bYydBu6t+6a6PYNcZljzZR9VXg+4=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
|
||||
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
|
||||
github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY=
|
||||
github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1 h1:SU5vSMR7hnwNxj24w34ZyCi/FmDZTkS4MhqMhdFk5YI=
|
||||
github.com/twitchyliquid64/golang-asm v0.15.1/go.mod h1:a1lVb/DtPvCB8fslRZhAngC2+aY1QWCk3Cedj/Gdt08=
|
||||
github.com/ugorji/go/codec v1.2.11 h1:BMaWp1Bb6fHwEtbplGBGJ498wD+LKlNSl25MjdZY4dU=
|
||||
github.com/ugorji/go/codec v1.2.11/go.mod h1:UNopzCgEMSXjBc6AOMqYvWC1ktqTAfzJZUZgYf6w6lg=
|
||||
golang.org/x/arch v0.0.0-20210923205945-b76863e36670/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/arch v0.3.0 h1:02VY4/ZcO/gBOH6PUaoiptASxtXU10jazRCP865E97k=
|
||||
golang.org/x/arch v0.3.0/go.mod h1:5om86z9Hs0C8fWVUuoMHwpExlXzs5Tkyp9hOrfG7pp8=
|
||||
golang.org/x/crypto v0.9.0 h1:LF6fAI+IutBocDJ2OT0Q1g8plpYljMZ4+lty+dsqw3g=
|
||||
golang.org/x/crypto v0.9.0/go.mod h1:yrmDGqONDYtNj3tH8X9dzUun2m2lzPa9ngI6/RUPGR0=
|
||||
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
|
||||
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
|
||||
golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
|
||||
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
|
||||
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
|
||||
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
|
||||
@@ -0,0 +1,539 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
type VideoResponse struct {
|
||||
VideoID string `json:"video_id"`
|
||||
ChannelName string `json:"channel_name"`
|
||||
}
|
||||
|
||||
// ctx is a package-level root context created once at start-up.
// NOTE(review): nothing in the visible part of this file references it —
// confirm whether it is still needed before removing it.
var ctx = context.Background()
|
||||
|
||||
// ChannelVideosResponse represents the response for channel videos scraping
|
||||
type ChannelVideosResponse struct {
|
||||
Channel string `json:"channel"`
|
||||
ChannelURL string `json:"channel_url"`
|
||||
SubscribersText string `json:"subscribers_text"`
|
||||
Subscribers int64 `json:"subscribers"`
|
||||
Videos []VideoItem `json:"videos"`
|
||||
}
|
||||
|
||||
// VideoItem holds per-video metadata extracted from the /videos page
|
||||
type VideoItem struct {
|
||||
VideoID string `json:"video_id"`
|
||||
Title string `json:"title,omitempty"`
|
||||
Length string `json:"length,omitempty"`
|
||||
ThumbnailURL string `json:"thumbnail_url,omitempty"`
|
||||
ViewsText string `json:"views_text,omitempty"`
|
||||
Views int64 `json:"views"`
|
||||
PublishedText string `json:"published_text,omitempty"`
|
||||
PublishedDate string `json:"published_date,omitempty"` // ISO 8601 date
|
||||
}
|
||||
|
||||
// Patterns used by normalizeChannelInput. The scheme/host part is matched
// case-insensitively; the captured path segment stops at '/', '?' or '#' so
// that query strings and fragments never leak into the handle.
var (
	channelHandleURLRe = regexp.MustCompile(`(?i:https?://(?:www\.)?youtube\.com)/(@[^/?#]+)`)
	channelLegacyURLRe = regexp.MustCompile(`(?i:https?://(?:www\.)?youtube\.com)/((?:channel|c|user)/([^/?#]+))`)
	channelPathURLRe   = regexp.MustCompile(`(?i:https?://(?:www\.)?youtube\.com)/([^/?#]+)`)
)

// normalizeChannelInput accepts a handle like "@FCBizoniUH" or "FCBizoniUH",
// or a channel URL, and returns a handle (always with a leading '@') together
// with the URL of the channel page to scrape.
//
// Rules:
//   - Anything that does not start with "http://", "https://", "www." or
//     "youtube.com/" is treated as a handle and mapped to
//     https://www.youtube.com/<handle>/videos.
//   - URLs without a scheme get "https://"; "youtube.com/..." is canonicalized
//     to "www.youtube.com/..."; "m.youtube.com" is rewritten to
//     "www.youtube.com".
//   - If the URL already names a tab (/videos, /shorts, /streams) it is
//     returned as-is; otherwise "/videos" is appended to the channel root.
//   - Legacy channel URLs (/channel/<id>, /c/<name>, /user/<name>) keep their
//     path; the returned handle is "@" + the last segment and is only a
//     best-effort display label.
//   - A URL from which no channel can be derived is returned unchanged and the
//     handle falls back to "@" + the input.
func normalizeChannelInput(input string) (handle string, url string) {
	in := strings.TrimSpace(input)
	lower := strings.ToLower(in)

	isURL := strings.HasPrefix(lower, "http://") ||
		strings.HasPrefix(lower, "https://") ||
		strings.HasPrefix(lower, "www.") ||
		strings.HasPrefix(lower, "youtube.com/")

	if !isURL {
		// Bare identifier: make sure it carries the '@' prefix.
		handle = in
		if !strings.HasPrefix(handle, "@") {
			handle = "@" + handle
		}
		return handle, fmt.Sprintf("https://www.youtube.com/%s/videos", handle)
	}

	// Ensure a scheme. The prefix test above is case-insensitive, so the
	// input is prepended to rather than trimmed with a case-sensitive match.
	switch {
	case strings.HasPrefix(lower, "www."):
		in = "https://" + in
	case strings.HasPrefix(lower, "youtube.com/"):
		in = "https://www." + in
	}

	// Normalize m.youtube.com -> www.youtube.com
	in = strings.ReplaceAll(in, "m.youtube.com", "www.youtube.com")

	// base is the canonical channel root (no tab, no query) when one could be
	// derived from the URL.
	base := ""
	if m := channelHandleURLRe.FindStringSubmatch(in); m != nil {
		handle = m[1]
		base = "https://www.youtube.com/" + handle
	} else if m := channelLegacyURLRe.FindStringSubmatch(in); m != nil {
		// /channel/<id>, /c/<name>, /user/<name>: these are not handles, so
		// the path must be preserved instead of being turned into "/@channel".
		handle = "@" + m[2]
		base = "https://www.youtube.com/" + m[1]
	} else if m := channelPathURLRe.FindStringSubmatch(in); m != nil {
		handle = m[1]
		if !strings.HasPrefix(handle, "@") {
			handle = "@" + handle
		}
		base = "https://www.youtube.com/" + handle
	}

	// Respect a provided tab if present; default to /videos otherwise.
	lowerIn := strings.ToLower(in)
	hasTab := strings.Contains(lowerIn, "/videos") ||
		strings.Contains(lowerIn, "/shorts") ||
		strings.Contains(lowerIn, "/streams")

	if hasTab || base == "" {
		url = in
	} else {
		url = base + "/videos"
	}

	if handle == "" {
		// Final fallback: derive a label from the raw input.
		handle = in
		if !strings.HasPrefix(handle, "@") {
			handle = "@" + handle
		}
	}
	return handle, url
}
|
||||
|
||||
// fetchChannelVideos scrapes the channel's /videos page and extracts video IDs present
|
||||
func fetchChannelVideos(channelInput string) (ChannelVideosResponse, error) {
|
||||
handle, channelURL := normalizeChannelInput(channelInput)
|
||||
log.Printf("Fetching channel videos: handle=%s url=%s", handle, channelURL)
|
||||
|
||||
// Craft request with a desktop UA to improve likelihood of getting full HTML payload
|
||||
req, err := http.NewRequest("GET", channelURL, nil)
|
||||
if err != nil {
|
||||
return ChannelVideosResponse{}, err
|
||||
}
|
||||
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.9")
|
||||
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return ChannelVideosResponse{}, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return ChannelVideosResponse{}, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return ChannelVideosResponse{}, err
|
||||
}
|
||||
html := string(body)
|
||||
|
||||
// Regex to capture all 11-char YouTube video IDs from initial data payload
|
||||
// Standard videos
|
||||
vidRe := regexp.MustCompile(`"videoRenderer":\{[^}]*?"videoId":"([a-zA-Z0-9_-]{11})"`)
|
||||
matches := vidRe.FindAllStringSubmatchIndex(html, -1)
|
||||
seen := make(map[string]struct{})
|
||||
var videos []VideoItem
|
||||
for _, idx := range matches {
|
||||
if len(idx) < 4 { // need at least match start/end and group start/end
|
||||
continue
|
||||
}
|
||||
// Extract ID
|
||||
id := html[idx[2]:idx[3]]
|
||||
if _, ok := seen[id]; ok {
|
||||
continue
|
||||
}
|
||||
seen[id] = struct{}{}
|
||||
|
||||
// Build a local window around the match to parse related fields
|
||||
start := idx[0]
|
||||
if start-2000 > 0 {
|
||||
start = start - 2000
|
||||
}
|
||||
end := idx[1] + 8000
|
||||
if end > len(html) {
|
||||
end = len(html)
|
||||
}
|
||||
snippet := html[start:end]
|
||||
|
||||
vi := VideoItem{VideoID: id}
|
||||
// Prefer deterministic thumbnail URL derived from video ID
|
||||
vi.ThumbnailURL = fmt.Sprintf("https://img.youtube.com/vi/%s/maxresdefault.jpg", id)
|
||||
|
||||
// Title (may appear as simpleText or runs)
|
||||
if m := regexp.MustCompile(`"title":\{"runs":\[\{"text":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.Title = unescapeYT(m[1])
|
||||
} else if m := regexp.MustCompile(`"title":\{"simpleText":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.Title = unescapeYT(m[1])
|
||||
}
|
||||
|
||||
// Length
|
||||
if m := regexp.MustCompile(`"lengthText":\{[^}]*"simpleText":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
// Generic lengthText.simpleText (with or without accessibility block)
|
||||
vi.Length = m[1]
|
||||
} else if m := regexp.MustCompile(`"lengthText":\{[^}]*"runs":\[\{"text":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
// lengthText.runs[0].text
|
||||
vi.Length = m[1]
|
||||
} else if m := regexp.MustCompile(`"thumbnailOverlays":\[[^\]]*?"thumbnailOverlayTimeStatusRenderer":\{"text":\{"simpleText":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
// Overlay badge duration
|
||||
vi.Length = m[1]
|
||||
} else if m := regexp.MustCompile(`yt-badge-shape__text">([^<]+)<`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
// Fallback: raw HTML badge text seen in thumbnails
|
||||
vi.Length = strings.TrimSpace(m[1])
|
||||
}
|
||||
|
||||
// Extra fallback: search the global HTML near the video anchor for DOM-based duration
|
||||
if vi.Length == "" {
|
||||
anchorRe := regexp.MustCompile(fmt.Sprintf(`<a[^>]+href="/watch\?v=%s[^\"]*"`, regexp.QuoteMeta(id)))
|
||||
if loc := anchorRe.FindStringIndex(html); loc != nil {
|
||||
// Search a forward window after the anchor for duration elements
|
||||
start2 := loc[1]
|
||||
end2 := start2 + 4000
|
||||
if end2 > len(html) {
|
||||
end2 = len(html)
|
||||
}
|
||||
chunk := html[start2:end2]
|
||||
// Try yt-formatted-string id="length" inner text like 5:59
|
||||
if m := regexp.MustCompile(`yt-formatted-string[^>]*id="length"[^>]*>([0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?)<`).FindStringSubmatch(chunk); len(m) >= 2 {
|
||||
vi.Length = strings.TrimSpace(m[1])
|
||||
} else if m := regexp.MustCompile(`yt-formatted-string[^>]*id="length"[^>]*aria-label="([^"]+)"`).FindStringSubmatch(chunk); len(m) >= 2 {
|
||||
if parsed := parseLocalizedDuration(unescapeYT(m[1])); parsed != "" {
|
||||
vi.Length = parsed
|
||||
}
|
||||
} else if m := regexp.MustCompile(`yt-badge-shape__text">([^<]+)<`).FindStringSubmatch(chunk); len(m) >= 2 {
|
||||
vi.Length = strings.TrimSpace(m[1])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Thumbnail URL (first in thumbnails array) as a fallback only if not set
|
||||
if vi.ThumbnailURL == "" {
|
||||
if m := regexp.MustCompile(`"thumbnail":\{"thumbnails":\[\{"url":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.ThumbnailURL = normalizeThumbURL(unescapeYT(m[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// Published time text (e.g., "3 days ago")
|
||||
if m := regexp.MustCompile(`"publishedTimeText":\{"simpleText":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.PublishedText = m[1]
|
||||
vi.PublishedDate = parseRelativeToISO(m[1])
|
||||
}
|
||||
|
||||
// Views
|
||||
if m := regexp.MustCompile(`"viewCountText":\{"simpleText":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.ViewsText = m[1]
|
||||
vi.Views = parseCountText(m[1])
|
||||
} else if m := regexp.MustCompile(`"viewCountText":\{"runs":\[\{"text":"([^"]+)"`).FindStringSubmatch(snippet); len(m) >= 2 {
|
||||
vi.ViewsText = m[1] + " views"
|
||||
vi.Views = parseCountText(m[1])
|
||||
}
|
||||
|
||||
videos = append(videos, vi)
|
||||
}
|
||||
|
||||
// Attempt to derive a displayable channel handle/name
|
||||
channelDisplay := handle
|
||||
// Try to extract canonicalBaseUrl if present
|
||||
canRe := regexp.MustCompile(`"canonicalBaseUrl":"\\/(@[^\"]+)"`)
|
||||
if m := canRe.FindStringSubmatch(html); len(m) >= 2 {
|
||||
channelDisplay = m[1]
|
||||
}
|
||||
|
||||
// Extract subscribers (header section)
|
||||
subText := ""
|
||||
// Try simpleText first
|
||||
if m := regexp.MustCompile(`"subscriberCountText":\{"simpleText":"([^"]+)"`).FindStringSubmatch(html); len(m) >= 2 {
|
||||
subText = m[1]
|
||||
} else {
|
||||
// Try runs: join all text segments inside subscriberCountText.runs
|
||||
if loc := regexp.MustCompile(`"subscriberCountText":\{"runs":\[`).FindStringIndex(html); loc != nil {
|
||||
// Take a slice starting at runs and limited length
|
||||
slice := html[loc[1]:]
|
||||
// Find the closing ]
|
||||
if endIdx := strings.Index(slice, "]}"); endIdx != -1 {
|
||||
runsChunk := slice[:endIdx]
|
||||
// Collect all text fields inside runs
|
||||
texts := regexp.MustCompile(`"text":"([^"]+)"`).FindAllStringSubmatch(runsChunk, -1)
|
||||
var parts []string
|
||||
for _, t := range texts {
|
||||
if len(t) >= 2 {
|
||||
parts = append(parts, unescapeYT(t[1]))
|
||||
}
|
||||
}
|
||||
subText = strings.Join(parts, "")
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fallbacks: approximateSubscriberCount or localized patterns like "131 odběratelů"
|
||||
if subText == "" {
|
||||
if m := regexp.MustCompile(`"approximateSubscriberCount":"([^"]+)"`).FindStringSubmatch(html); len(m) >= 2 {
|
||||
subText = m[1]
|
||||
}
|
||||
}
|
||||
if subText == "" {
|
||||
// Case-insensitive; match digits with optional spaces/commas/dots before localized label
|
||||
if m := regexp.MustCompile(`(?i)([0-9][0-9\s\.,]*)\s*(odběratel(?:é|ů)?|subscribers?)`).FindStringSubmatch(html); len(m) >= 2 {
|
||||
subText = strings.TrimSpace(m[0])
|
||||
}
|
||||
}
|
||||
subs := parseCountText(subText)
|
||||
|
||||
res := ChannelVideosResponse{
|
||||
Channel: channelDisplay,
|
||||
ChannelURL: channelURL,
|
||||
SubscribersText: subText,
|
||||
Subscribers: subs,
|
||||
Videos: videos,
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// unescapeYT decodes the escape sequences in a string fragment lifted out of
// the JSON embedded in a YouTube page.
//
// The fragment is decoded as the contents of a JSON string literal, so every
// standard escape is handled (\/, \", \\, \n, \uXXXX including \u0026 and
// surrogate pairs). Input without a backslash is returned unchanged. If the
// fragment is not valid JSON string content (for example it contains a bare
// double quote or an unknown escape), only the two replacements that were
// always supported are applied: `\/` -> `/` and `\u0026` -> `&`.
func unescapeYT(s string) string {
	if !strings.Contains(s, `\`) {
		// Nothing to decode; also keeps non-JSON text byte-for-byte intact.
		return s
	}

	var decoded string
	if err := json.Unmarshal([]byte(`"`+s+`"`), &decoded); err == nil {
		return decoded
	}

	// Best-effort fallback for fragments that are not valid JSON.
	s = strings.ReplaceAll(s, `\/`, `/`)
	s = strings.ReplaceAll(s, `\u0026`, `&`)
	return s
}
|
||||
|
||||
// normalizeThumbURL ensures thumbnails use https and removes query artifacts if needed
|
||||
func normalizeThumbURL(u string) string {
|
||||
u = unescapeYT(u)
|
||||
if strings.HasPrefix(u, "//") {
|
||||
u = "https:" + u
|
||||
}
|
||||
return u
|
||||
}
|
||||
|
||||
// relativeAgoRe matches "<n> <unit>[s] ago" anywhere in a lower-cased string.
// Because it is unanchored it also covers prefixed forms such as
// "streamed 3 days ago" or "premiered 2 weeks ago".
var relativeAgoRe = regexp.MustCompile(`(\d+)[\s-]*(second|minute|hour|day|week|month|year)s?\s+ago`)

// parseRelativeToISO converts an English relative time such as "3 days ago",
// "2 weeks ago" or "Streamed 1 year ago" into a calendar date (yyyy-mm-dd)
// computed relative to the current local time.
//
// It returns "" when rel contains no recognizable "<n> <unit> ago" phrase or
// when the number does not fit in an int.
func parseRelativeToISO(rel string) string {
	m := relativeAgoRe.FindStringSubmatch(strings.ToLower(rel))
	if m == nil {
		return ""
	}
	n, err := strconv.Atoi(m[1])
	if err != nil {
		// Only possible on overflow; a nonsensical date would be worse than none.
		return ""
	}

	now := time.Now()
	var then time.Time
	switch m[2] {
	case "second":
		then = now.Add(-time.Duration(n) * time.Second)
	case "minute":
		then = now.Add(-time.Duration(n) * time.Minute)
	case "hour":
		then = now.Add(-time.Duration(n) * time.Hour)
	case "day":
		then = now.AddDate(0, 0, -n)
	case "week":
		then = now.AddDate(0, 0, -7*n)
	case "month":
		then = now.AddDate(0, -n, 0)
	case "year":
		then = now.AddDate(-n, 0, 0)
	default:
		return ""
	}
	return then.Format("2006-01-02")
}
|
||||
|
||||
// Patterns used by parseLocalizedDuration, compiled once. For each unit the
// English pattern is listed before the Czech one.
var (
	durationClockRe   = regexp.MustCompile(`^\d{1,2}:\d{2}(?::\d{2})?$`)
	durationBareNumRe = regexp.MustCompile(`^(\d+)$`)
	durationHourRes   = []*regexp.Regexp{
		regexp.MustCompile(`(\d+)\s*hour`),
		regexp.MustCompile(`(\d+)\s*hodin(?:a|y)?`),
	}
	durationMinuteRes = []*regexp.Regexp{
		regexp.MustCompile(`(\d+)\s*minute`),
		regexp.MustCompile(`(\d+)\s*minut(?:a|y)?`),
	}
	durationSecondRes = []*regexp.Regexp{
		regexp.MustCompile(`(\d+)\s*second`),
		regexp.MustCompile(`(\d+)\s*sekund(?:a|y)?`),
	}
)

// firstNonZeroDurationPart returns the number captured by the first pattern
// that matches t with a non-zero value, or 0 if there is none. Numbers that
// do not fit in an int are ignored.
func firstNonZeroDurationPart(t string, patterns []*regexp.Regexp) int {
	for _, re := range patterns {
		m := re.FindStringSubmatch(t)
		if len(m) < 2 {
			continue
		}
		if n, err := strconv.Atoi(m[1]); err == nil && n != 0 {
			return n
		}
	}
	return 0
}

// parseLocalizedDuration converts a localized duration phrase (e.g.
// "5 minutes, 59 seconds" or Czech "5 minut a 59 sekund") into an "m:ss" or
// "h:mm:ss" string. English and basic Czech unit names are supported.
//
// Input that already looks like "m:ss" / "h:mm:ss" is returned trimmed. A
// string consisting only of digits is interpreted as minutes. "" is returned
// when no duration can be recognized.
func parseLocalizedDuration(s string) string {
	t := strings.ToLower(strings.TrimSpace(s))
	// Normalize non-breaking spaces, both as the HTML entity and as the raw
	// code point, so that "5&nbsp;minutes" parses like "5 minutes".
	t = strings.ReplaceAll(t, "&nbsp;", " ")
	t = strings.ReplaceAll(t, "\u00a0", " ")
	t = strings.TrimSpace(t)

	// Already in 0:00 or 0:00:00 form.
	if durationClockRe.MatchString(t) {
		return t
	}

	h := firstNonZeroDurationPart(t, durationHourRes)
	m := firstNonZeroDurationPart(t, durationMinuteRes)
	sec := firstNonZeroDurationPart(t, durationSecondRes)

	// Nothing recognized: treat a bare number as a minute count.
	if h == 0 && m == 0 && sec == 0 {
		if mm := durationBareNumRe.FindStringSubmatch(t); len(mm) >= 2 {
			if n, err := strconv.Atoi(mm[1]); err == nil {
				m = n
			}
		}
	}

	switch {
	case h > 0:
		return fmt.Sprintf("%d:%02d:%02d", h, m, sec)
	case m > 0 || sec > 0:
		return fmt.Sprintf("%d:%02d", m, sec)
	default:
		return ""
	}
}
|
||||
|
||||
// countTokenRe matches the first number in a lower-cased count string: either
// a comma-grouped integer ("1,234,567") or a plain run of digits, optionally
// followed by a decimal part and a k/m/b magnitude suffix.
var countTokenRe = regexp.MustCompile(`((?:[0-9]{1,3}(?:,[0-9]{3})+|[0-9]+)(?:\.[0-9]+)?)([kmb])?`)

// parseCountText converts a human-readable count such as "1,234 views",
// "12K subscribers" or "3.4M" into an integer.
//
// Only the first number in s is considered. Commas are accepted as thousands
// separators when they group exactly three digits. The suffixes k, m and b
// (case-insensitive, directly after the number) multiply by 10^3, 10^6 and
// 10^9. The result is rounded to the nearest integer and capped at the largest
// int64. Strings without a number yield 0.
func parseCountText(s string) int64 {
	m := countTokenRe.FindStringSubmatch(strings.ToLower(strings.TrimSpace(s)))
	if m == nil {
		return 0
	}

	f, err := strconv.ParseFloat(strings.ReplaceAll(m[1], ",", ""), 64)
	if err != nil {
		return 0
	}

	switch m[2] {
	case "k":
		f *= 1_000
	case "m":
		f *= 1_000_000
	case "b":
		f *= 1_000_000_000
	}

	// Converting an out-of-range float to int64 is not well defined; clamp.
	if f >= 1<<63 {
		return 1<<63 - 1
	}
	// f is never negative (the pattern has no sign), so adding 0.5 before
	// truncation rounds to nearest and absorbs float error such as 4.35*100.
	return int64(f + 0.5)
}
|
||||
|
||||
func channelVideosHandler(w http.ResponseWriter, r *http.Request) {
|
||||
channel := r.URL.Query().Get("channel")
|
||||
if channel == "" {
|
||||
log.Println("Missing channel parameter")
|
||||
http.Error(w, "Missing channel parameter. Provide a handle like @FCBizoniUH, FCBBizoniUH, or a full channel URL.", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
res, err := fetchChannelVideos(channel)
|
||||
if err != nil {
|
||||
log.Printf("Failed to fetch channel videos for %s: %v", channel, err)
|
||||
http.Error(w, "Failed to fetch channel videos", http.StatusInternalServerError)
|
||||
return
|
||||
}
|
||||
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
json.NewEncoder(w).Encode(res)
|
||||
}
|
||||
|
||||
// corsMiddleware wraps next so that every response carries permissive CORS
// headers (any origin; GET, POST and OPTIONS; the Content-Type request
// header). OPTIONS requests are answered directly with 200 and are not
// forwarded to next.
func corsMiddleware(next http.Handler) http.Handler {
	withCORS := func(w http.ResponseWriter, r *http.Request) {
		hdr := w.Header()
		hdr.Set("Access-Control-Allow-Origin", "*")
		hdr.Set("Access-Control-Allow-Methods", "GET, POST, OPTIONS")
		hdr.Set("Access-Control-Allow-Headers", "Content-Type")

		if r.Method != http.MethodOptions {
			next.ServeHTTP(w, r)
			return
		}

		// Preflight: the headers above are the whole answer.
		w.WriteHeader(http.StatusOK)
	}
	return http.HandlerFunc(withCORS)
}
|
||||
|
||||
// rootHandler serves the health/info document at exactly "/". Because the
// "/" pattern of a ServeMux also receives every otherwise unmatched path, any
// other path is answered with 404.
//
// The JSON body contains the service status, name, version and the list of
// available endpoints.
func rootHandler(w http.ResponseWriter, r *http.Request) {
	if r.URL.Path != "/" {
		http.NotFound(w, r)
		return
	}

	w.Header().Set("Content-Type", "application/json")
	response := map[string]any{
		"status":  "ok",
		"service": "YouTube Scraper",
		"version": "1.0.0",
		"endpoints": map[string]string{
			"channel_videos": "/channel_videos?channel={handle_or_url}",
		},
	}
	if err := json.NewEncoder(w).Encode(response); err != nil {
		// Headers are already written; the error can only be logged.
		log.Printf("Failed to encode root response: %v", err)
	}
}
|
||||
|
||||
func main() {
|
||||
port := os.Getenv("PORT")
|
||||
if port == "" {
|
||||
port = "7857"
|
||||
}
|
||||
|
||||
mux := http.NewServeMux()
|
||||
|
||||
// Create a new mux with CORS middleware
|
||||
handlerWithCORS := corsMiddleware(mux)
|
||||
|
||||
// Register routes on the original mux
|
||||
mux.HandleFunc("/", rootHandler)
|
||||
mux.HandleFunc("/channel_videos", channelVideosHandler)
|
||||
|
||||
log.Printf("YouTube Scraper starting on port %s", port)
|
||||
log.Fatal(http.ListenAndServe(":"+port, handlerWithCORS))
|
||||
}
|
||||
Reference in New Issue
Block a user