How to get all pages of a website in Golang
I want to create a service that takes websites as input from an array (inputData), reading each site from the `_website` key. It should scan any website (whatever kind of site it is, scanning the entire site, not just the home page), merge the data from every scanned page into an array, and, once all websites have been scanned, print the result to the console.
Input:
const inputData = [
{_website:['https://example1.com/']},
{_website:['https://example2.com']}
];
For example, the output must look like this:
Output: [
{
_website:['https://example1.com/'],
_link:['https://example1.com/about'],
_statusCode:[200],
} ,
{
_website:[],
_link:['https://example1.com/blog'],
_statusCode:[200],
},
{
_website:[],
_link:['https://example1.com/shop'],
_statusCode:[200],
},
//...
{
_website:['https://example2.com/'],
_link:['https://example2.com/about'],
_statusCode:[200],
} ,
{
_website:[],
_link:['https://example2.com/brand'],
_statusCode:[200],
} ,
{
_website:[],
_link:['https://example2.com/blog'],
_statusCode:[200],
} ,
//...
]
More hands-on tutorials on getting all pages of a website in Golang are available at https://www.itying.com/category-94-b0.html
2 Replies
To scan an entire site you can use Go's net/http package together with an HTML parser to extract links. Here is a complete solution:
package main

import (
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

type PageData struct {
	Website    []string `json:"_website"`
	Link       []string `json:"_link"`
	StatusCode []int    `json:"_statusCode"`
}

type InputItem struct {
	Website []string `json:"_website"`
}

var (
	visitedURLs = make(map[string]bool)
	mu          sync.Mutex
	client      = &http.Client{
		Timeout: 10 * time.Second,
	}
)

// extractLinks parses an HTML document and returns all links that stay on the same site.
func extractLinks(baseURL string, body string) ([]string, error) {
	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return nil, err
	}
	var links []string
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.DataAtom == atom.A {
			for _, attr := range n.Attr {
				if attr.Key == "href" {
					link := attr.Val
					if strings.HasPrefix(link, "/") {
						// Resolve root-relative links against the base URL.
						link = strings.TrimRight(baseURL, "/") + link
						links = append(links, link)
					} else if strings.HasPrefix(link, "http") {
						// Keep only links on the same site.
						if strings.HasPrefix(link, baseURL) {
							links = append(links, link)
						}
					}
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return links, nil
}

// markVisited reports whether url was already visited and marks it otherwise.
func markVisited(url string) bool {
	mu.Lock()
	defer mu.Unlock()
	if visitedURLs[url] {
		return true
	}
	visitedURLs[url] = true
	return false
}

// crawlWebsite does a breadth-first crawl of one website and sends one
// PageData per fetched page to results.
func crawlWebsite(website string, results chan<- PageData, wg *sync.WaitGroup) {
	defer wg.Done()
	queue := []string{website}
	first := true
	for len(queue) > 0 {
		url := queue[0]
		queue = queue[1:]
		if markVisited(url) {
			continue
		}
		resp, err := client.Get(url)
		if err != nil {
			continue
		}
		// Read the response body and close it immediately (no defer inside a loop).
		data, err := io.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			continue
		}
		body := string(data)
		// Only the first entry per website carries the _website value,
		// matching the expected output.
		websiteField := []string{}
		if first {
			websiteField = []string{website}
			first = false
		}
		results <- PageData{
			Website:    websiteField,
			Link:       []string{url},
			StatusCode: []int{resp.StatusCode},
		}
		// Only parse HTML pages for further links.
		if strings.Contains(resp.Header.Get("Content-Type"), "text/html") {
			links, err := extractLinks(website, body)
			if err == nil {
				queue = append(queue, links...)
			}
		}
	}
}

func main() {
	inputData := []InputItem{
		{Website: []string{"https://example1.com/"}},
		{Website: []string{"https://example2.com"}},
	}

	results := make(chan PageData, 100)
	var wg sync.WaitGroup
	var allResults []PageData

	// Collect results; done is closed once the results channel has been drained.
	done := make(chan struct{})
	go func() {
		for result := range results {
			allResults = append(allResults, result)
		}
		close(done)
	}()

	// Start one crawler per website.
	for _, item := range inputData {
		for _, website := range item.Website {
			wg.Add(1)
			go crawlWebsite(website, results, &wg)
		}
	}

	wg.Wait()
	close(results)
	<-done

	// Print the results.
	fmt.Println("Output: [")
	for i, result := range allResults {
		if i > 0 {
			fmt.Println(",")
		}
		fmt.Println("  {")
		fmt.Printf("    _website:%v,\n", result.Website)
		fmt.Printf("    _link:%v,\n", result.Link)
		fmt.Printf("    _statusCode:%v,\n", result.StatusCode)
		fmt.Print("  }")
	}
	fmt.Println("\n]")
}
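The prefix-based link handling in extractLinks is deliberately simple: relative paths such as `about.html` or `../blog` are not resolved, and the same-site check can be fooled by look-alike hosts. If you need something more robust, the standard net/url package can resolve any href against the page it came from and compare hosts. A minimal sketch; the resolveLink helper is illustrative and not part of the code above:

package main

import (
	"fmt"
	"net/url"
)

// resolveLink resolves href against the page it appeared on and reports
// whether the resulting URL stays on the same host.
func resolveLink(pageURL, href string) (string, bool) {
	base, err := url.Parse(pageURL)
	if err != nil {
		return "", false
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", false
	}
	abs := base.ResolveReference(ref)
	return abs.String(), abs.Host == base.Host
}

func main() {
	link, sameSite := resolveLink("https://example1.com/blog/", "../about")
	fmt.Println(link, sameSite) // https://example1.com/about true
}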
If you need to handle JavaScript-rendered pages, you can use chromedp:
package main

import (
	"context"
	"time"

	"github.com/chromedp/chromedp"
)

// PageData mirrors the struct used in the previous example.
type PageData struct {
	Website    []string `json:"_website"`
	Link       []string `json:"_link"`
	StatusCode []int    `json:"_statusCode"`
}

func crawlWithChrome(website string) ([]PageData, error) {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	var results []PageData
	visited := make(map[string]bool)

	var crawl func(string) error
	crawl = func(url string) error {
		if visited[url] {
			return nil
		}
		visited[url] = true

		var htmlContent string
		err := chromedp.Run(ctx,
			chromedp.Navigate(url),
			chromedp.WaitReady("body"),
			chromedp.OuterHTML("html", &htmlContent),
		)
		if err != nil {
			return err
		}

		// Extract links and crawl them recursively.
		// The link-extraction logic still needs to be implemented here
		// (see the sketch after this example).
		results = append(results, PageData{
			Website: []string{website},
			Link:    []string{url},
			// chromedp does not expose the HTTP status code here; 200 is a placeholder.
			StatusCode: []int{200},
		})
		return nil
	}
	return results, crawl(website)
}
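For the missing link-extraction step, one option (an assumption on my part, not part of the original answer) is to evaluate a small piece of JavaScript in the rendered page with chromedp.Evaluate and collect every anchor's href. A minimal sketch; extractLinksWithChrome is an illustrative helper:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/chromedp/chromedp"
)

// extractLinksWithChrome navigates to pageURL in a headless browser and
// returns the href of every anchor on the rendered page.
func extractLinksWithChrome(ctx context.Context, pageURL string) ([]string, error) {
	var links []string
	js := `Array.from(document.querySelectorAll('a[href]')).map(a => a.href)`
	err := chromedp.Run(ctx,
		chromedp.Navigate(pageURL),
		chromedp.WaitReady("body"),
		// Evaluate runs the JS in the page and unmarshals the resulting
		// array of strings into links.
		chromedp.Evaluate(js, &links),
	)
	return links, err
}

func main() {
	ctx, cancel := chromedp.NewContext(context.Background())
	defer cancel()
	ctx, cancel = context.WithTimeout(ctx, 30*time.Second)
	defer cancel()

	links, err := extractLinksWithChrome(ctx, "https://example1.com/")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	for _, l := range links {
		fmt.Println(l)
	}
}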
Install the dependencies before running:
go get golang.org/x/net/html
go get golang.org/x/net/html/atom
# only if you need JavaScript support
go get github.com/chromedp/chromedp
This solution will:
- Crawl multiple websites concurrently
- Avoid visiting the same URL twice
- Follow only links on the same domain
- Record the HTTP status code of every page
- Print the output in the required format (if you would rather emit real JSON, see the sketch after this list)
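Since PageData already carries the _website/_link/_statusCode json tags, the hand-formatted printing in main can be swapped for encoding/json if you want output that is valid JSON. A minimal sketch; printResults is an illustrative helper, not part of the code above:

package main

import (
	"encoding/json"
	"fmt"
)

type PageData struct {
	Website    []string `json:"_website"`
	Link       []string `json:"_link"`
	StatusCode []int    `json:"_statusCode"`
}

// printResults marshals the collected results into indented JSON whose keys
// match the expected output (_website, _link, _statusCode).
func printResults(allResults []PageData) error {
	out, err := json.MarshalIndent(allResults, "", "  ")
	if err != nil {
		return err
	}
	fmt.Println(string(out))
	return nil
}

func main() {
	sample := []PageData{
		{Website: []string{"https://example1.com/"}, Link: []string{"https://example1.com/about"}, StatusCode: []int{200}},
		{Website: []string{}, Link: []string{"https://example1.com/blog"}, StatusCode: []int{200}},
	}
	if err := printResults(sample); err != nil {
		fmt.Println("error:", err)
	}
}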

