about summary refs log tree commit diff stats
path: root/scraper
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2025-06-18 10:30:31 -0400
committer lolcat <will@lolcat.ca> 2025-06-18 10:30:31 -0400
commit f73b5f0298f06b44c5cd8a84e327b8e1d7d4ea95 (patch)
tree 787571f017246a74bef862d70aecf1a3612c48cb /scraper
parent 3e1487e614f3bb5d86ddda6da63a39a8cdaadf15 (diff)
fix yandex web
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/yandex.php 89
1 files changed, 84 insertions, 5 deletions
diff --git a/scraper/yandex.php b/scraper/yandex.php
index 7af8781..f73c3fd 100644
--- a/scraper/yandex.php
+++ b/scraper/yandex.php
@@ -14,7 +14,7 @@ class yandex{
// backend included in the scraper functions
}
- private function get($proxy, $url, $get = [], $nsfw){
+ private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){
$curlproc = curl_init();
@@ -25,19 +25,55 @@ class yandex{
curl_setopt($curlproc, CURLOPT_URL, $url);
+ // extract "i" cookie
+ if($get_cookie === 0){
+
+ $cookies_tmp = [];
+ curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
+
+ $length = strlen($header);
+
+ $header = explode(":", $header, 2);
+
+ if(trim(strtolower($header[0])) == "set-cookie"){
+
+ $cookie_tmp = explode("=", trim($header[1]), 2);
+
+ $cookies_tmp[trim($cookie_tmp[0])] =
+ explode(";", $cookie_tmp[1], 2)[0];
+ }
+
+ return $length;
+ });
+ }
+
switch($nsfw){
case "yes": $nsfw = "0"; break;
case "maybe": $nsfw = "1"; break;
case "no": $nsfw = "2"; break;
}
+ switch($get_cookie){
+
+ case 0:
+ $cookie = "";
+ break;
+
+ case 1:
+ $cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw;
+ break;
+
+ default:
+ $cookie = "Cookie: i=" . $get_cookie;
+ }
+
$headers =
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Encoding: gzip",
"Accept-Language: en-US,en;q=0.5",
"DNT: 1",
- "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw,
+ $cookie,
"Referer: https://yandex.com/images/search",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
@@ -59,6 +95,17 @@ class yandex{
$data = curl_exec($curlproc);
+ if($get_cookie === 0){
+
+ if(isset($cookies_tmp["i"])){
+
+ return $cookies_tmp["i"];
+ }else{
+
+ throw new Exception("Failed to get Yandex clearance cookie");
+ }
+ }
+
if(curl_errno($curlproc)){
throw new Exception(curl_error($curlproc));
@@ -217,6 +264,23 @@ class yandex{
// https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712
// &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023
+ // get clearance cookie
+ if(($cookie = apcu_fetch("yandexweb_cookie")) === false){
+
+ $proxy = $this->backend->get_ip();
+
+ $cookie =
+ $this->get(
+ $proxy,
+ "https://yandex.ru/support2/smart-captcha/ru/",
+ [],
+ false,
+ 0
+ );
+
+ apcu_store("yandexweb_cookie", $cookie);
+ }
+
if($get["npt"]){
[$npt, $proxy] = $this->backend->get($get["npt"], "web");
@@ -226,7 +290,8 @@ class yandex{
$proxy,
"https://yandex.com" . $npt,
[],
- "yes"
+ "yes",
+ $cookie
);
}else{
@@ -236,7 +301,7 @@ class yandex{
throw new Exception("Search term is empty!");
}
- $proxy = $this->backend->get_ip();
+ $proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy;
$lang = $get["lang"];
$older = $get["older"];
$newer = $get["newer"];
@@ -283,7 +348,8 @@ class yandex{
$proxy,
"https://yandex.com/search/site/",
$params,
- "yes"
+ "yes",
+ $cookie
);
}catch(Exception $error){
@@ -314,6 +380,19 @@ class yandex{
$this->fuckhtml->load($html);
+ // Scrape page blocked error
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName("title");
+
+ if(
+ count($title) !== 0 &&
+ $title[0]["innerHTML"] == "403"
+ ){
+
+ throw new Exception("Yandex blocked this proxy or 4get instance.");
+ }
+
// get nextpage
$npt =
$this->fuckhtml
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage