mirror of
https://github.com/Dvorinka/Trackeep.git
synced 2026-06-03 20:12:58 +00:00
first test
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
package models
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
// ScrapedContent represents content extracted from web pages
|
||||
type ScrapedContent struct {
|
||||
ID uint `json:"id" gorm:"primaryKey"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
DeletedAt gorm.DeletedAt `json:"-" gorm:"index"`
|
||||
|
||||
UserID uint `json:"user_id" gorm:"not null;index"`
|
||||
User User `json:"user,omitempty" gorm:"foreignKey:UserID"`
|
||||
|
||||
// Source information
|
||||
URL string `json:"url" gorm:"not null"`
|
||||
Domain string `json:"domain"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Author string `json:"author"`
|
||||
PublishedDate *time.Time `json:"published_date"`
|
||||
LastScraped time.Time `json:"last_scraped"`
|
||||
|
||||
// Extracted content
|
||||
Content string `json:"content" gorm:"type:text"`
|
||||
Summary string `json:"summary" gorm:"type:text"`
|
||||
Keywords []string `json:"keywords" gorm:"serializer:json"`
|
||||
Tags []Tag `json:"tags,omitempty" gorm:"many2many:scraped_content_tags;"`
|
||||
Images []ScrapedImage `json:"images,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
Links []ScrapedLink `json:"links,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
Videos []ScrapedVideo `json:"videos,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
|
||||
// Content analysis
|
||||
ContentType string `json:"content_type"` // article, blog, news, tutorial, documentation
|
||||
WordCount int `json:"word_count"`
|
||||
ReadingTime int `json:"reading_time"` // estimated minutes
|
||||
Difficulty string `json:"difficulty"` // beginner, intermediate, advanced
|
||||
QualityScore float64 `json:"quality_score"` // 0-100
|
||||
|
||||
// Processing status
|
||||
Status string `json:"status" gorm:"default:pending"` // pending, processing, completed, failed
|
||||
ErrorMessage string `json:"error_message"`
|
||||
ProcessingLog string `json:"processing_log" gorm:"type:text"`
|
||||
|
||||
// Relationships
|
||||
BookmarkID *uint `json:"bookmark_id,omitempty"`
|
||||
Bookmark *Bookmark `json:"bookmark,omitempty" gorm:"foreignKey:BookmarkID"`
|
||||
NoteID *uint `json:"note_id,omitempty"`
|
||||
Note *Note `json:"note,omitempty" gorm:"foreignKey:NoteID"`
|
||||
}
|
||||
|
||||
// ScrapedImage represents images extracted from web pages
|
||||
type ScrapedImage struct {
|
||||
ID uint `json:"id" gorm:"primaryKey"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
DeletedAt gorm.DeletedAt `json:"-" gorm:"index"`
|
||||
|
||||
ScrapedContentID uint `json:"scraped_content_id" gorm:"not null;index"`
|
||||
ScrapedContent ScrapedContent `json:"scraped_content,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
|
||||
URL string `json:"url"`
|
||||
AltText string `json:"alt_text"`
|
||||
Title string `json:"title"`
|
||||
Width int `json:"width"`
|
||||
Height int `json:"height"`
|
||||
Format string `json:"format"` // jpg, png, gif, svg, webp
|
||||
Size int64 `json:"size"` // bytes in bytes
|
||||
IsMainImage bool `json:"is_main_image" gorm:"default:false"`
|
||||
LocalPath string `json:"local_path"` // if downloaded
|
||||
ThumbnailPath string `json:"thumbnail_path"` // if thumbnail generated
|
||||
}
|
||||
|
||||
// ScrapedLink represents links extracted from web pages
|
||||
type ScrapedLink struct {
|
||||
ID uint `json:"id" gorm:"primaryKey"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
DeletedAt gorm.DeletedAt `json:"-" gorm:"index"`
|
||||
|
||||
ScrapedContentID uint `json:"scraped_content_id" gorm:"not null;index"`
|
||||
ScrapedContent ScrapedContent `json:"scraped_content,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
|
||||
URL string `json:"url"`
|
||||
Text string `json:"text"`
|
||||
Title string `json:"title"`
|
||||
LinkType string `json:"link_type"` // internal, external, download, email
|
||||
IsNoFollow bool `json:"is_no_follow"`
|
||||
IsSponsored bool `json:"is_sponsored"`
|
||||
Domain string `json:"domain"`
|
||||
}
|
||||
|
||||
// ScrapedVideo represents videos extracted from web pages
|
||||
type ScrapedVideo struct {
|
||||
ID uint `json:"id" gorm:"primaryKey"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
DeletedAt gorm.DeletedAt `json:"-" gorm:"index"`
|
||||
|
||||
ScrapedContentID uint `json:"scraped_content_id" gorm:"not null;index"`
|
||||
ScrapedContent ScrapedContent `json:"scraped_content,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
|
||||
URL string `json:"url"`
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Duration string `json:"duration"` // in format "HH:MM:SS"
|
||||
Thumbnail string `json:"thumbnail"`
|
||||
Platform string `json:"platform"` // youtube, vimeo, twitch, etc.
|
||||
VideoID string `json:"video_id"` // platform-specific ID
|
||||
IsEmbeddable bool `json:"is_embeddable"`
|
||||
}
|
||||
|
||||
// ScrapingJob represents a web scraping job
|
||||
type ScrapingJob struct {
|
||||
ID uint `json:"id" gorm:"primaryKey"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
DeletedAt gorm.DeletedAt `json:"-" gorm:"index"`
|
||||
|
||||
UserID uint `json:"user_id" gorm:"not null;index"`
|
||||
User User `json:"user,omitempty" gorm:"foreignKey:UserID"`
|
||||
|
||||
// Job details
|
||||
URL string `json:"url" gorm:"not null"`
|
||||
JobType string `json:"job_type" gorm:"default:full_scrape"` // full_scrape, content_only, images_only, links_only
|
||||
Priority string `json:"priority" gorm:"default:normal"` // low, normal, high, urgent
|
||||
Status string `json:"status" gorm:"default:pending"` // pending, processing, completed, failed, cancelled
|
||||
|
||||
// Processing options
|
||||
ExtractImages bool `json:"extract_images" gorm:"default:true"`
|
||||
ExtractLinks bool `json:"extract_links" gorm:"default:true"`
|
||||
ExtractVideos bool `json:"extract_videos" gorm:"default:true"`
|
||||
GenerateSummary bool `json:"generate_summary" gorm:"default:true"`
|
||||
DownloadImages bool `json:"download_images" gorm:"default:false"`
|
||||
ExtractMetadata bool `json:"extract_metadata" gorm:"default:true"`
|
||||
|
||||
// Timing and results
|
||||
StartedAt *time.Time `json:"started_at,omitempty"`
|
||||
CompletedAt *time.Time `json:"completed_at,omitempty"`
|
||||
Progress float64 `json:"progress" gorm:"default:0"` // 0-100
|
||||
ErrorMessage string `json:"error_message"`
|
||||
|
||||
// Relationships
|
||||
ScrapedContentID *uint `json:"scraped_content_id,omitempty"`
|
||||
ScrapedContent *ScrapedContent `json:"scraped_content,omitempty" gorm:"foreignKey:ScrapedContentID"`
|
||||
}
|
||||
|
||||
// BeforeCreate hooks
|
||||
func (s *ScrapedContent) BeforeCreate(tx *gorm.DB) error {
|
||||
if s.Status == "" {
|
||||
s.Status = "pending"
|
||||
}
|
||||
if s.LastScraped.IsZero() {
|
||||
s.LastScraped = time.Now()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (j *ScrapingJob) BeforeCreate(tx *gorm.DB) error {
|
||||
if j.Status == "" {
|
||||
j.Status = "pending"
|
||||
}
|
||||
if j.Priority == "" {
|
||||
j.Priority = "normal"
|
||||
}
|
||||
if j.JobType == "" {
|
||||
j.JobType = "full_scrape"
|
||||
}
|
||||
return nil
|
||||
}
|
||||
Reference in New Issue
Block a user