|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156 |
- package test
-
- import (
- "fmt"
- "github.com/gocolly/colly"
- "github.com/gocolly/colly/extensions"
- "github.com/tidwall/gjson"
- "net/http"
- "regexp"
- "strings"
- )
-
-
-
-
// Response is a generic JSON envelope made of a message, a numeric success
// flag and an arbitrary payload.
type Response struct {
	// Msg is serialized under the "msg" key.
	Msg string `json:"msg"`
	// Success is serialized under the "success" key.
	Success int `json:"success"`
	// Data is serialized under the "data" key and may hold any value.
	Data interface{} `json:"data"`
}
-
- func main() {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- for i := 0; i < 1000; i++ {
- fmt.Println(i)
- scrapPDD()
- }
- }
-
- func scrapJD() {
- c := colly.NewCollector(func(collector *colly.Collector) {
- extensions.RandomUserAgent(collector)
- })
- c.OnResponse(func(r *colly.Response) {
- re, _ := regexp.Compile(`[(]//[^\s]*[)]`)
- body := r.Body
- fmt.Println(string(body))
- urls := re.FindAllString(string(body), -1)
- fmt.Println(urls)
- for _, url := range urls {
- url = strip(url, "()")
- url = "https:" + url
- fmt.Println(url)
- }
- })
- c.Visit("https://wqsitem.jd.com/detail/100008309360_d100008309360_normal.html")
- }
-
- func scrapPDD() {
- var cookies = []*http.Cookie{}
- var mapcookies = make(map[string]string)
- url := fmt.Sprintf("https://mobile.yangkeduo.com/goods.html?goods_id=%s", "156632692649")
- cs := "api_uid=CiHUKl9DZKpL6QBVK4qWAg==; _nano_fp=Xpdbl0PyX5Pxn0TynT_DTGXbst0kz5cjzGAQDnBR; ua=Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F84.0.4147.135%20Safari%2F537.36; webp=1; quick_entrance_click_record=20200824%2C1; PDDAccessToken=XRC6FNX7FRBL6AJRMRBRN4CDG2PZXO3YJZYHFUA4O2PLDAWVYXHA1125821; pdd_user_id=9622705741400; pdd_user_uin=F27EAZ4V5S7EGEVMCJI2P7RFLE_GEXDA; chat_config={'host_whitelist':['.yangkeduo.com','.pinduoduo.com','.10010.com/queen/tencent/pinduoduo-fill.html','.ha.10086.cn/pay/card-sale!toforward.action','wap.ha.10086.cn','m.10010.com']}; pdd_vds=gaLMNqmfGfyYEpyYiZGWopaCicNHbXGWtDNcOZnWLqiDNfLHOXnZaqtCLDiX"
- csList := strings.Split(cs, ";")
- for _, c := range csList {
- s := strings.Trim(c, " ")
- sList := strings.SplitN(s, "=", 2)
-
- mapcookies[sList[len(sList)-len(sList)]] = sList[(len(sList) - len(sList) + 1)]
-
- }
- fmt.Println(mapcookies)
- for key, value := range mapcookies {
- if key == "ua" {
- continue
- }
- cookies = append(cookies, &http.Cookie{Name: key, Value: value})
- }
- c := colly.NewCollector(
- colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"),
- )
-
- c.OnResponse(func(r *colly.Response) {
- re, _ := regexp.Compile(`window.rawData=.*}`)
- body := r.Body
- fmt.Println(string(body))
- result := re.FindString(string(body))
-
- result = strings.SplitN(result, "=", 2)[1]
-
- value := gjson.Get(result, "store.initDataObj.goods.detailGallery")
-
- list := value.Array()
- imageList := []string{}
- for _, v := range list {
- nv := gjson.Get(v.String(), "url")
- imageList = append(imageList, nv.String())
- }
- fmt.Println(imageList)
- ck := c.Cookies("https://mobile.yangkeduo.com")
- fmt.Println(ck)
- cookies = ck
- })
-
- c.SetCookies("https://mobile.yangkeduo.com", cookies)
-
- c.Visit(url)
- }
-
// strip returns ss with every leading and trailing character that appears in
// charss removed. Characters in the middle of ss are never touched. If ss is
// empty or consists only of characters from charss, the result is "".
//
// This delegates to strings.Trim, which has the same contract. Unlike a
// []rune round-trip, it returns the kept part of ss byte-for-byte, so invalid
// UTF-8 sequences inside ss are not rewritten to U+FFFD.
func strip(ss string, charss string) string {
	return strings.Trim(ss, charss)
}
|