用Clojure脚本处理Yandex图片搜索和YouTube视频下载地址

Yandex 图片搜索

论搜索能力, Yandex不在Google之下, 有时候比Google的结果还要好. 下面的Clojure脚本将会抓取Yandex图片搜索的结果, 并提取所有图片的图片地址、来源网址等信息, 这样就可以在同一个界面中以原始大小显示所有图片.

 
 
(defn make-literal
  "Returns `a` with a backslash inserted in front of every double-quote
  character. No other character is rewritten."
  [a]
  (let [quote-mark "\""
        escaped-quote "\\\""]
    (.replace a quote-mark escaped-quote)))
 
(defn page-prototyping
  "Resets walker `w` to its root node, asks a fresh prototype.InfoExtracter
  for the elements matching `selector` (passing `w` and `url` along), and
  returns a lazy sequence of `func` applied to each element found."
  [url selector func w]
  ;; Every lookup starts from the document root, since the walker is reused
  ;; across calls.
  (.setCurrentNode w (.getRoot w))
  (let [extracter (prototype.InfoExtracter.)
        matches (.FindElements extracter selector w url)]
    (map (fn [element] (func element)) matches)))
 
(defn downlad-and-parse-proxy
  "Downloads `url`, builds a DOM walker over the downloaded page, and runs
  page-prototyping once for every [selector handler] pair in `pairs`.
  Returns a vector holding one result sequence per pair, in input order."
  [url pairs]
  (let [page (prototype.Util/DownloadPage url)
        builder (prototype.DOMBuilder.)
        document (.GetDoc builder page)
        walker (.GetWalkerByDoc builder document)
        run-pair (fn [pair]
                   (page-prototyping url (first pair) (second pair) walker))]
    ;; Progress message, written before any selector is evaluated.
    (print "finished downloading and parsing")
    (mapv run-pair pairs)))
 
(defn extract-anything-between
  "Returns the shortest run of characters in `target` that sits between the
  first occurrence of `prefix` and the next occurrence of `subfix`.

  Both delimiters are matched literally: they are passed through
  java.util.regex.Pattern/quote, so regex metacharacters such as ( [ . { in
  a delimiter cannot change or break the match.

  Returns nil when the delimiters are not found, or when `target` is nil."
  [prefix subfix target]
  (when target
    (let [pattern (re-pattern (str (java.util.regex.Pattern/quote prefix)
                                   ;; [\s\S] also spans line breaks; *? keeps
                                   ;; the capture as short as possible
                                   "([\\s\\S]*?)"
                                   (java.util.regex.Pattern/quote subfix)))]
      (second (re-find pattern target)))))
 
(defn extract-yandex-image
  "Returns the text found in `onmouse` between `\"href\":\"` and the next
  `\"}`, as located by extract-anything-between."
  [onmouse]
  (let [opening "\"href\":\""
        closing "\"}"]
    (extract-anything-between opening closing onmouse)))
 
(defn extract-yandex-image-title
  "Returns a vector [title text url] pulled out of `onmouse`.

  Everything in front of the first `{\"title\":` is dropped first, then each
  field is read as the text between `\"<field>\":\"` and the next `\",`.
  A field that cannot be located yields whatever extract-anything-between
  returns for a miss."
  [onmouse]
  (let [from-title (clojure.string/replace onmouse #"^[\s\S]*?\{\"title\":" "{\"title\":")
        between (fn [opening]
                  (extract-anything-between opening "\"," from-title))]
    (mapv between ["\"title\":\"" "\"text\":\"" "\"url\":\""])))
 
(defn handle-yandex-image
  "Reads the \"onmousedown\" attribute of `element` and returns what
  extract-yandex-image finds in it."
  [element]
  (-> element
      (.getAttribute "onmousedown")
      extract-yandex-image))
 
(defn handle-yandex-image-title
  "Reads the \"data-bem\" attribute of `element` and returns what
  extract-yandex-image-title finds in it."
  [element]
  (-> element
      (.getAttribute "data-bem")
      extract-yandex-image-title))
 
(defn yandex-getimagelist
  "Fetches the Yandex image search page for `query` and extracts two result
  lists from it: image addresses (from `a.serp-item__link` elements) and
  title data (from `div.serp-item` elements). The two lists are handed to
  `transpose` and its result is returned.

  NOTE(review): `transpose` and `construct-proxy-url-yandex-image` are not
  defined in this part of the file; presumably `transpose` pairs the i-th
  image with the i-th title entry - confirm against their definitions."
  [query]
  (let [search-url (construct-proxy-url-yandex-image query)
        extractors [["a.serp-item__link {}" handle-yandex-image]
                    ["div.serp-item {}" handle-yandex-image-title]]]
    (transpose (downlad-and-parse-proxy search-url extractors))))
 
(defn image-entry-to-html
  "Renders one search result as an HTML fragment: a linked <h2> heading
  (`title` pointing at `title-url`), a <div> holding `title-text`, and a
  full-width <img> whose source is `image-url`. Each of the three parts is
  preceded by a <br/>.

  NOTE(review): the arguments are inserted verbatim, without HTML escaping."
  [title title-text title-url image-url]
  (let [heading (str "<a href=\"" title-url "\"><h2>" title "</h2></a>")
        caption (str "<div>" title-text "</div>")
        picture (str "<img style=\"width:100%\" src=\"" image-url "\"/>")]
    (apply str (interleave (repeat "<br/>") [heading caption picture]))))
 
(defn yandex-image-result-to-html
  "Renders `result` as one HTML string.

  Each entry of `result` is expected to be [image-url [title text url]].
  Every entry is rendered with image-entry-to-html, after the title url and
  the image url have been passed through proxy-encodeurl, and the rendered
  entries are joined with two line breaks.

  The separator is \"<br/><br/>\": the previous \"</br></br>\" was an end tag
  for a void element, which is not valid HTML and only rendered as line
  breaks thanks to browser error recovery."
  [result]
  (join "<br/><br/>"
        (map (fn [entry]
               (let [image-url (first entry)
                     info (second entry)]
                 (image-entry-to-html
                   (get info 0)
                   (get info 1)
                   (proxy-encodeurl (get info 2))
                   (proxy-encodeurl image-url))))
             result)))
 
; usage
; (yandex-image-result-to-html (yandex-getimagelist query))
 

几点说明: 脚本用到了CSS解析库和Java网页下载器, 可以在我的GitHub上找到这个工具. proxy-encodeurl 将URL变成代理地址, 使用时请换成自己的网页代理地址.

Youtube视频文件地址

Youtube提供了视频信息查询的接口, 只要知道id, 就可以访问下面的地址

 
; Example expression: appends a video id to the get_video_info endpoint (`id` must already be bound).
(str "http://youtube.com/get_video_info?video_id=" id )
 

该接口将返回所有可用的格式及其下载地址.

 
(defn construct-proxy-url-youtube
  "Builds the get_video_info address for video `id` and returns it after
  passing it through proxy-encodeurl."
  [id]
  (let [info-url (str "http://youtube.com/get_video_info?video_id=" id)]
    (proxy-encodeurl info-url)))
 
(defn parse_str
  "Parses a query string of the form \"k1=v1&k2=v2\" into a map from keyword
  keys to string values. Values are returned as-is (no URL decoding).

  - Each pair is split on its FIRST \"=\" only, so a value that itself
    contains \"=\" is kept whole instead of being cut short.
  - A key with no value (\"k\" or \"k=\") maps to nil.
  - Empty segments (as in \"a=1&&b=2\") are skipped.
  - A nil or empty `s` yields an empty map.
  - When a key occurs more than once, the last occurrence wins."
  [s]
  (into {}
        (for [pair (.split (str s) "&")
              :when (seq pair)
              ;; limit 2 => at most [key rest-of-pair]
              :let [[k v] (.split ^String pair "=" 2)]]
          [(keyword k) (not-empty v)])))
 
(defn parse-youtube-info
  "Downloads the get_video_info response for video `id` (through the proxy
  address built by construct-proxy-url-youtube) and returns it parsed by
  parse_str."
  [id]
  (-> id
      construct-proxy-url-youtube
      prototype.Util/DownloadPage
      parse_str))
 
(defn youtube-stream-handle
  "Parses one stream description with parse_str, replaces its :url value with
  the UTF-8 URL-decoded form, and pretty-prints the resulting map.
  Returns whatever pprint returns."
  [stream]
  (let [fields (parse_str stream)
        decoded-url (java.net.URLDecoder/decode (:url fields) "UTF-8")]
    (-> fields
        (assoc :url decoded-url)
        pprint)))
 
(defn get-youtube-video
  "Prints every stream described in `parsed`, the map produced by
  parse-youtube-info.

  The values under :url_encoded_fmt_stream_map and :adaptive_fmts are each
  URL-decoded (UTF-8), split on \",\" into single stream descriptions and
  handed to youtube-stream-handle, with a separator line printed between the
  two groups.

  Both groups are processed eagerly, so all output appears even when the
  caller ignores the return value. A key that is missing or empty
  contributes no streams instead of failing inside URLDecoder.

  Returns the sequence of youtube-stream-handle results for the
  adaptive_fmts group."
  [parsed]
  (let [decode-and-split (fn [encoded]
                           (if (empty? encoded)
                             []
                             (seq (.split (java.net.URLDecoder/decode encoded "UTF-8") ","))))
        splited-stream (decode-and-split (:url_encoded_fmt_stream_map parsed))
        splited-adaptive_fmts (decode-and-split (:adaptive_fmts parsed))]
    (doall (map youtube-stream-handle splited-stream))
    (print "===============adaptive_fmts starts\n")
    ;; doall: the handler only has side effects, so a lazy map here would
    ;; silently print nothing unless the caller realised the result.
    (doall (map youtube-stream-handle splited-adaptive_fmts))))
 
 

输出例子

 
user> (get-youtube-video foo)
{:type "video%2Fmp4%3B+codecs%3D%22avc1.64001F%2C+mp4a.40.2%22",
 
 :url
 "http://r2---sn-i3b7kn7r.googlevideo.com/videoplayback?mm=31&mn=sn-i3b7kn7r&fexp=9408710%2C9416126%2C9417683%2C9418400%2C9419838%2C9420452%2C9422324%2C9422578%2C9422596%2C9422618%2C9422967%2C9423419%2C9423662%2C9424307%2C9424477%2C9424645%2C9424698%2C9424812%2C9424963&key=yt6&upn=_1ydvfgEdIU&sparams=dur%2Cid%2Cip%2Cipbits%2Citag%2Clmt%2Cmime%2Cmm%2Cmn%2Cms%2Cmv%2Cnh%2Cpl%2Cratebypass%2Csource%2Cupn%2Cexpire&ip=117.18.8.1&itag=22&ms=au&source=youtube&ratebypass=yes&mv=u&dur=2401.593&lmt=1433483717982658&mime=video%2Fmp4&signature=52C54356DA02E5A5FCCED4D75DDDD6D9A6724ECD.4A7DD2AE81B5884974044F3DAAC7B1CDFA62516A&pl=24&ipbits=0&nh=IgpwcjAxLmhrZzA4KgkxMjcuMC4wLjE&mt=1448460311&id=o-AIqtMdrOWt9rM2I_o9ZMlFjg_DyV3JqWcrFP4llaJ6cJ&sver=3&expire=1448482581",
 :itag "22",
 :quality "hd720",
 :fallback_host "tc.v6.cache2.googlevideo.com"}
{:type "video%2Fwebm%3B+codecs%3D%22vp8.0%2C+vorbis%22",
 :url
 "http://r2---sn-i3b7kn7r.googlevideo.com/videoplayback?mm=31&mn=sn-i3b7kn7r&fexp=9408710%2C9416126%2C9417683%2C9418400%2C9419838%2C9420452%2C9422324%2C9422578%2C9422596%2C9422618%2C9422967%2C9423419%2C9423662%2C9424307%2C9424477%2C9424645%2C9424698%2C9424812%2C9424963&key=yt6&upn=_1ydvfgEdIU&sparams=dur%2Cid%2Cip%2Cipbits%2Citag%2Clmt%2Cmime%2Cmm%2Cmn%2Cms%2Cmv%2Cnh%2Cpl%2Cratebypass%2Csource%2Cupn%2Cexpire&ip=117.18.8.1&itag=43&ms=au&source=youtube&ratebypass=yes&mv=u&dur=0.000&lmt=1433360043844226&mime=video%2Fwebm&signature=2D7998CE7092CB905E02D470A3892198AF5EB611.07D14CBC9CC1CB8C0C22B58C563F0A0721AF3782&pl=24&ipbits=0&nh=IgpwcjAxLmhrZzA4KgkxMjcuMC4wLjE&mt=1448460311&id=o-AIqtMdrOWt9rM2I_o9ZMlFjg_DyV3JqWcrFP4llaJ6cJ&sver=3&expire=1448482581",
 :itag "43",
 :quality "medium",
 :fallback_host "tc.v17.cache5.googlevideo.com"}
 
.....