广告平台(总站长使用)
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 
 
 
 

157 Zeilen
4.8 KiB

  1. package test
  2. import (
  3. "fmt"
  4. "github.com/gocolly/colly"
  5. "github.com/gocolly/colly/extensions"
  6. "github.com/tidwall/gjson"
  7. "net/http"
  8. "regexp"
  9. "strings"
  10. )
  11. /*
  12. 目前可用接口
  13. [商品查询]https://www.showdoc.com.cn/59349170678610?page_id=339616554551473
  14. [商品详情]https://www.showdoc.com.cn/59349170678610?page_id=339687047645094
  15. */
  // Response is the envelope an SDK reply is decoded into.
  //
  // It maps the JSON keys "msg", "success" and "data". Data is deliberately
  // untyped (interface{}), so after decoding it holds whatever generic value
  // encoding/json produced for the "data" member.
  type Response struct {
  	Msg     string      `json:"msg"`
  	Success int         `json:"success"`
  	Data    interface{} `json:"data"`
  }
  22. func main() {
  23. // // JD
  24. // postData := map[string]string{"keyword": "联想", "p": "1", "size": "10"}
  25. // fmt.Println(postData["time"])
  26. // res, _ := zhimeng.Send("jd", "getgoods", postData)
  27. // fmt.Println(string(res))
  28. // p := Response{}
  29. // json.Unmarshal(res, &p)
  30. // fmt.Println(p)
  31. // // VIP
  32. // postData = map[string]string{"keyword": "联想", "p": "1", "size": "10", "order": "0"}
  33. // fmt.Println(postData["time"])
  34. // res, _ = zhimeng.Send("wph", "seach_goods", postData)
  35. // fmt.Println(string(res))
  36. // p = Response{}
  37. // json.Unmarshal(res, &p)
  38. // fmt.Println(p)
  39. // // PDD
  40. // postData = map[string]string{"keyword": "联想", "p": "1", "size": "10", "sort": "goods_price asc"}
  41. // res, _ = zhimeng.Send("pdd", "getgoods", postData)
  42. // fmt.Println(string(res))
  43. // p = Response{}
  44. // json.Unmarshal(res, &p)
  45. // fmt.Println(p)
  46. for i := 0; i < 1000; i++ {
  47. fmt.Println(i)
  48. scrapPDD()
  49. }
  50. }
  51. func scrapJD() {
  52. c := colly.NewCollector(func(collector *colly.Collector) {
  53. extensions.RandomUserAgent(collector)
  54. })
  55. c.OnResponse(func(r *colly.Response) {
  56. re, _ := regexp.Compile(`[(]//[^\s]*[)]`)
  57. body := r.Body
  58. fmt.Println(string(body))
  59. urls := re.FindAllString(string(body), -1)
  60. fmt.Println(urls)
  61. for _, url := range urls {
  62. url = strip(url, "()")
  63. url = "https:" + url
  64. fmt.Println(url)
  65. }
  66. })
  67. c.Visit("https://wqsitem.jd.com/detail/100008309360_d100008309360_normal.html")
  68. }
  69. func scrapPDD() {
  70. var cookies = []*http.Cookie{}
  71. var mapcookies = make(map[string]string)
  72. url := fmt.Sprintf("https://mobile.yangkeduo.com/goods.html?goods_id=%s", "156632692649")
  73. cs := "api_uid=CiHUKl9DZKpL6QBVK4qWAg==; _nano_fp=Xpdbl0PyX5Pxn0TynT_DTGXbst0kz5cjzGAQDnBR; ua=Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F84.0.4147.135%20Safari%2F537.36; webp=1; quick_entrance_click_record=20200824%2C1; PDDAccessToken=XRC6FNX7FRBL6AJRMRBRN4CDG2PZXO3YJZYHFUA4O2PLDAWVYXHA1125821; pdd_user_id=9622705741400; pdd_user_uin=F27EAZ4V5S7EGEVMCJI2P7RFLE_GEXDA; chat_config={'host_whitelist':['.yangkeduo.com','.pinduoduo.com','.10010.com/queen/tencent/pinduoduo-fill.html','.ha.10086.cn/pay/card-sale!toforward.action','wap.ha.10086.cn','m.10010.com']}; pdd_vds=gaLMNqmfGfyYEpyYiZGWopaCicNHbXGWtDNcOZnWLqiDNfLHOXnZaqtCLDiX"
  74. csList := strings.Split(cs, ";")
  75. for _, c := range csList {
  76. s := strings.Trim(c, " ")
  77. sList := strings.SplitN(s, "=", 2)
  78. mapcookies[sList[len(sList)-len(sList)]] = sList[(len(sList) - len(sList) + 1)]
  79. }
  80. fmt.Println(mapcookies)
  81. for key, value := range mapcookies {
  82. if key == "ua" {
  83. continue
  84. }
  85. cookies = append(cookies, &http.Cookie{Name: key, Value: value})
  86. }
  87. c := colly.NewCollector(
  88. colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"),
  89. )
  90. c.OnResponse(func(r *colly.Response) {
  91. re, _ := regexp.Compile(`window.rawData=.*}`)
  92. body := r.Body
  93. fmt.Println(string(body))
  94. result := re.FindString(string(body))
  95. // fmt.Println(result)
  96. result = strings.SplitN(result, "=", 2)[1]
  97. // fmt.Println(result)
  98. value := gjson.Get(result, "store.initDataObj.goods.detailGallery")
  99. // fmt.Println(value)
  100. list := value.Array()
  101. imageList := []string{}
  102. for _, v := range list {
  103. nv := gjson.Get(v.String(), "url")
  104. imageList = append(imageList, nv.String())
  105. }
  106. fmt.Println(imageList)
  107. ck := c.Cookies("https://mobile.yangkeduo.com")
  108. fmt.Println(ck)
  109. cookies = ck
  110. })
  111. c.SetCookies("https://mobile.yangkeduo.com", cookies)
  112. c.Visit(url)
  113. }
  // strip returns ss with every leading and trailing character that appears in
  // charss removed. Characters in the interior of ss are never touched, even if
  // they are in charss. If ss is empty or consists only of characters from
  // charss the result is ""; if charss is empty ss is returned unchanged.
  //
  // This delegates to strings.Trim, which has exactly these semantics. Unlike
  // the previous hand-rolled []rune scan, bytes that are not valid UTF-8 are
  // passed through as-is instead of being rewritten to U+FFFD.
  func strip(ss string, charss string) string {
  	return strings.Trim(ss, charss)
  }