Yandex图片搜索格式更改

用Clojure抓取Yandex图片搜索结果的脚本突然失效了, 原因是Yandex返回的HTML格式发生了变化. Yandex修改输出格式相对频繁, 而Google几乎不怎么变.

原来的格式

(defn extract-yandex-image-url
  "Old-format helper. Runs encoded-url through decodeurl and then urldecode,
  and hands the decoded text to extract-anything-between using the img_href
  key marker as the start delimiter and the next quote-comma-quote sequence
  as the end delimiter. Returns whatever extract-anything-between returns."
  [encoded-url]
  (->> encoded-url
       decodeurl   ; first decoding pass
       urldecode   ; second decoding pass
       (extract-anything-between "\"img_href\":\"" "\",\"")))
 
(defn extract-yandex-image
  "Old-format helper. Returns a two-element vector built from data-bem:
  first the referring page url (the text between the url key marker and the
  domain key marker), then the image url obtained by passing the text found
  between `img_url=` and ` id=` to extract-yandex-image-url."
  [data-bem]
  (let [referrer    (extract-anything-between "\",\"url\":\"" "\",\"domain\"" data-bem)
        encoded-img (extract-anything-between "img_url=" " id=" data-bem)]
    [referrer
     (extract-yandex-image-url encoded-img)]))

Yandex新的格式为

{"serp-item":{"reqid":"1457666891531988-1097369245266431447211320-man1-3465-I
MG","freshness":"normal","preview":[{"url":"http://images.equipboard.com/uplo
ads/item/image/10331/filco-minila-air-xl.jpg?v=1451026057","width":600,"heigh
t":458},{"url":"http://images.equipboard.com/uploads/item/image/10331/filco-m
inila-air-xl.jpg?v=1451026057","width":600,"height":458}],"dups":[{"url":"htt
p://images.equipboard.com/uploads/item/image/10331/filco-minila-air-xl.jpg?v=
1451026057","width":600,"height":458}],"thumb":{"url":"//im2-tub-com.yandex.n
et/i?id=d8eec736e18a0833981dd1b3909ac1d3&n=33&h=215&w=282","size":{"width":28
2,"height":215}},"snippet":{"title":"Filco Minila Air Reviews & Prices Equipb
oard","hasTitle":true,"text":"<b>Filco</b> <b>Minila</b> <b>Air</b>.","url":"
http://equipboard.com/items/filco-minila-air","domain":"equipboard.com","redi
rUrl":"http://www.yandex.com/clck/jsredir?from=www.yandex.com%3Bimages%2Fsear
ch%3Bimages%3B%3B&text=&etext=990.siSWeGQUhNSy7c10u3oQyYTPcQlnB47iFSpQGJggryW
cAAkqfs75NR5Ujv8xub9Z.6da37fffb6b6227a664a008d9dd4c3812b88f401&uuid=&state=ti
d_Wvm4RM28ca_MiO4Ne9osTPtpHS9wicjEF5X7fRziVPIHCd9FyQ&data=UlNrNmk5WktYejR0eWJ
FYk1Ldmtxck55N1JmbWVPSVpNczZiS1puTjZWajN4a2ZibFptWnBUNmlndmllREtMQmFWWFMwRDB2
SWUyV2NjYUxGWFdYN0tvaUZ5OGFDWGJDanBweTFId1lDcVJFM0JzQkV5VjdEYlFub180ekZBQUw&b
64e=2&sign=7391aae18afffc14c5f8f8700cdeb95a&keyno=0&l10n=en"},"detail_url":"/
images/search?text=filco%20minila%20air&img_url=http%3A%2F%2Frsc2.saatscommer
ce.com%2Fimg535b2b211068f_l.jpg&pos=0&rpt=simage","img_href":"http://images.e
quipboard.com/uploads/item/image/10331/filco-minila-air-xl.jpg?v=1451026057",
"useProxy":false,"pos":0,"id":"5c34f72b5aaa795599d1d421ab6c5ca8","timeQuery":
"1457666891","counterPath":"thumb/normal"}}

图片地址位于img_href和useProxy之间, 因此新的提取方法应该是

; New-format lookup: the start delimiter ends right at the opening quote of the
; img_href value, and the end delimiter starts at the quote that closes it
; (the one followed by ,"useProxy"). Presumably the helper returns the text
; between the two delimiters inside data-bem -- confirm against its definition.
(extract-anything-between "\",\"img_href\":\"" "\",\"useProxy\"" data-bem)

修改如下

(defn extract-yandex-image-url
  "New-format helper. Returns encoded-url unchanged; no decoding is done here.
  When encoded-url is nil an error line is printed to *out* and nil is
  still returned."
  [encoded-url]
  (when (nil? encoded-url)
    (println "ERROR:in extract-yandex-image-url, accepting nil"))
  ;; in version 3, we get url directly
  encoded-url)
 
(defn extract-yandex-image
  "New-format helper. Returns a two-element vector built from data-bem:
  first the referring page url (the text between the url key marker and the
  domain key marker), then the image url taken from between the img_href key
  marker and the useProxy key marker and passed through
  extract-yandex-image-url. If the image url lookup yields nil, an error line
  including data-bem is printed before the vector is built."
  [data-bem]
  (let [image-link (extract-anything-between "\",\"img_href\":\"" "\",\"useProxy\"" data-bem)] ; version 3
    ;; a nil image link means the expected markers were not found
    (when (nil? image-link)
      (println "ERROR:img_url from data bem is nil, dump of data-bem: " data-bem))
    (let [referrer (extract-anything-between "\",\"url\":\"" "\",\"domain\"" data-bem)]
      [referrer
       (extract-yandex-image-url image-link)])))