From 2d74cce367b2e4cba6d60aa41bb2af64f437c650 Mon Sep 17 00:00:00 2001 From: lolcat Date: Wed, 18 Jun 2025 10:30:31 -0400 Subject: [PATCH] fix yandex web --- src/scraper/yandex.php | 121 ++++++++++++++++++++++++++++++++++------- 1 file changed, 100 insertions(+), 21 deletions(-) diff --git a/src/scraper/yandex.php b/src/scraper/yandex.php index 853480a..8646aed 100644 --- a/src/scraper/yandex.php +++ b/src/scraper/yandex.php @@ -13,31 +13,67 @@ class yandex{ include "lib/backend.php"; // backend included in the scraper functions } - - private function get($proxy, $url, $get = [], $nsfw){ - + + private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){ + $curlproc = curl_init(); - + if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; } - + curl_setopt($curlproc, CURLOPT_URL, $url); - + + // extract "i" cookie + if($get_cookie === 0){ + + $cookies_tmp = []; + curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ + + $length = strlen($header); + + $header = explode(":", $header, 2); + + if(trim(strtolower($header[0])) == "set-cookie"){ + + $cookie_tmp = explode("=", trim($header[1]), 2); + + $cookies_tmp[trim($cookie_tmp[0])] = + explode(";", $cookie_tmp[1], 2)[0]; + } + + return $length; + }); + } + switch($nsfw){ case "yes": $nsfw = "0"; break; case "maybe": $nsfw = "1"; break; case "no": $nsfw = "2"; break; } - + + switch($get_cookie){ + + case 0: + $cookie = ""; + break; + + case 1: + $cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw; + break; + + default: + $cookie = "Cookie: i=" . $get_cookie; + } + $headers = ["User-Agent: " . config::USER_AGENT, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", "Accept-Encoding: gzip", "Accept-Language: en-US,en;q=0.5", "DNT: 1", - "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw, + $cookie, "Referer: https://yandex.com/images/search", "Connection: keep-alive", "Upgrade-Insecure-Requests: 1", @@ -56,11 +92,22 @@ class yandex{ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); $this->backend->assign_proxy($curlproc, $proxy); - + $data = curl_exec($curlproc); - + + if($get_cookie === 0){ + + if(isset($cookies_tmp["i"])){ + + return $cookies_tmp["i"]; + }else{ + + throw new Exception("Failed to get Yandex clearance cookie"); + } + } + if(curl_errno($curlproc)){ - + throw new Exception(curl_error($curlproc)); } @@ -216,9 +263,26 @@ class yandex{ // https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712 // &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023 - + + // get clearance cookie + if(($cookie = apcu_fetch("yandexweb_cookie")) === false){ + + $proxy = $this->backend->get_ip(); + + $cookie = + $this->get( + $proxy, + "https://yandex.ru/support2/smart-captcha/ru/", + [], + false, + 0 + ); + + apcu_store("yandexweb_cookie", $cookie); + } + if($get["npt"]){ - + [$npt, $proxy] = $this->backend->get($get["npt"], "web"); $html = @@ -226,17 +290,18 @@ class yandex{ $proxy, "https://yandex.com" . $npt, [], - "yes" + "yes", + $cookie ); }else{ - + $search = $get["s"]; if(strlen($search) === 0){ throw new Exception("Search term is empty!"); } - - $proxy = $this->backend->get_ip(); + + $proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy; $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; @@ -283,10 +348,11 @@ class yandex{ $proxy, "https://yandex.com/search/site/", $params, - "yes" + "yes", + $cookie ); }catch(Exception $error){ - + throw new Exception("Could not get search page"); } @@ -311,9 +377,22 @@ class yandex{ "news" => [], "related" => [] ]; - + $this->fuckhtml->load($html); - + + // Scrape page blocked error + $title = + $this->fuckhtml + ->getElementsByTagName("title"); + + if( + count($title) !== 0 && + $title[0]["innerHTML"] == "403" + ){ + + throw new Exception("Yandex blocked this proxy or 4get instance."); + } + // get nextpage $npt = $this->fuckhtml