From c69abf41b0efcd5c94530d4ed9a31d0c55054435 Mon Sep 17 00:00:00 2001 From: lolcat Date: Thu, 11 Sep 2025 02:05:21 -0400 Subject: google scraper fix --- scraper/google.php | 3058 +++++++++++++++++++++------------------------------- 1 file changed, 1232 insertions(+), 1826 deletions(-) (limited to 'scraper') diff --git a/scraper/google.php b/scraper/google.php index b935a0e..c83b084 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -504,40 +504,12 @@ class google{ } } - private function get($proxy, $url, $get = [], $use_lynx = false){ + private function get($proxy, $url, $get = []){ $curlproc = curl_init(); - - if($use_lynx === false){ - - $headers = [ - "User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1", - "Priority: u=1", - "TE: trailers" - ]; - - // use http2 - curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); - }else{ - - $headers = [ - "Accept: text/html, text/plain, text/sgml, */*;q=0.01", - "Accept-Encoding: gzip, compress, bzip2", - "Accept-Language: en", - "User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8" - ]; - } + + // use http2 in all cases + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); if($get !== []){ $get = http_build_query($get); @@ -547,7 +519,21 @@ class google{ curl_setopt($curlproc, CURLOPT_URL, $url); curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding - curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ + "User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=1", + "TE: trailers" + ]); curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); @@ -569,344 +555,496 @@ class google{ curl_close($curlproc); - if($use_lynx){ - - return mb_convert_encoding($data, "UTF-8", "ISO-8859-1"); - } - return $data; } - private function scrape_dimg($html){ - - // get images loaded through javascript - $this->dimg = []; - - preg_match_all( - '/function\(\){google\.ldi=({.*?});/', - $html, - $dimg - ); + public function web($get){ - if(isset($dimg[1])){ + if($get["npt"]){ - foreach($dimg[1] as $i){ + [$params, $proxy] = $this->backend->get($get["npt"], "web"); + + $params = json_decode($params, true); + + try{ + $json = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ - $tmp = json_decode($i, true); - foreach($tmp as $key => $value){ + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + $spellcheck = $get["spellcheck"]; + $proxy = $this->backend->get_ip(); + + $offset = 0; + + /* + https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc + + https://www.google.com/search?udm=14& + yv=3& + q=asmr& + biw=1920& + bih=947& + start=0& + sa=N& + asearch=arc& + cs=1& + async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc + */ + + $params = [ + "udm" => 14, + "yv" => 3, + "q" => $search, + "biw" => 1920, + "bih" => 947, + "start" => 0, + "sa" => "N", + "asearch" => "arc", + "cs" => 1, + "async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc", + "hl" => "en", + "num" => 20 + ]; + + // country + if($country != "any"){ + + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ + + $params["lr"] = "lang_" . $lang; + } + + // generate tbs + $tbs = []; + + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); + + if( + $older !== null || + $newer !== null + ){ + + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // spellcheck filter + if($spellcheck == "no"){ + + $params["nfpr"] = "1"; + } + + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ - $this->dimg[$key] = - $this->unshit_thumb( - $value - ); + $params["tbs"] .= $key . ":" . $value . ","; } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + + try{ + $json = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); } + + //$json = file_get_contents("scraper/google.js"); } - // get additional javascript base64 images - preg_match_all( - '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', - $html, - $dimg - ); + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; - if(isset($dimg[1])){ + $this->fuckhtml->load($json); + $this->detect_sorry(); + + // get next page + /* + $npt = + $this->fuckhtml + ->getElementsByAttributeName( + "data-state-token", + "div" + ); + + if(count($npt) !== 0){ - for($i=0; $ifuckhtml + ->getTextContent( + $npt[0]["attributes"]["data-state-token"] + ); + + $params["start"] += 10; + + $out["npt"] = + $this->backend->store( + json_encode($params), + "web", + $proxy + ); + }*/ + + // get invididual results + $results = + $this->fuckhtml + ->getElementsByAttributeName( + "data-hveid", + "div" + ); + + foreach($results as $result){ + + $this->fuckhtml->load($result); + + //echo $result["innerHTML"]; + + $snfs = + $this->fuckhtml + ->getElementsByAttributeName( + "data-snf", + "div" + ); + + $title = null; + $description = null; + $link = null; + $sublinks = []; + $date = null; + $thumb = [ + "ratio" => null, + "url" => null + ]; + $table = []; + + // probe for title + $title_node = + $this->fuckhtml + ->getElementsByTagName( + "h3" + ); + + if(count($title_node) !== 0){ - $delims = explode(",", $dimg[2][$i]); - $string = + // found a title node + $title = $this->fuckhtml - ->parseJsString( - $dimg[1][$i] + ->getTextContent( + $title_node[0] ); + } + + if($title === null){ - foreach($delims as $delim){ - - $this->dimg[trim($delim, "'")] = $string; - } + // should not happen + continue; } - } - } - - - private function scrape_imagearr($html){ - // get image links arrays - preg_match_all( - '/\[[0-9]+,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', - $html, - $image_arr - ); - - $this->image_arr = []; - if(isset($image_arr[1])){ - for($i=0; $ifuckhtml->load($snf); + + // probe for thumbnail + $thumbnail = $this->fuckhtml - ->parseJsString( - $image_arr[5][$i] + ->getElementsByAttributeName( + "alt", + "img" ); - if( - preg_match( - '/^x-raw-image/', - $original - ) - ){ + foreach($thumbnail as $t){ - // only add thumbnail, google doesnt have OG resolution - $this->image_arr[$image_arr[1][$i]] = [ - [ - "url" => - $this->unshit_thumb( - $this->fuckhtml - ->parseJsString( - $image_arr[2][$i] - ) - ), - "width" => (int)$image_arr[7][$i], // pass the OG image width & height - "height" => (int)$image_arr[6][$i] - ] + if( + isset($t["attributes"]["style"]) && + preg_match( + '/height ?: ?([0-9]+)px/', + $t["attributes"]["style"], + $match + ) && + (int)$match[1] < 40 + ){ + + // found a favicon, ignore + continue; + } + + $thumb = [ + "ratio" => "1:1", + "url" => + $this->fuckhtml + ->getTextContent( + $thumbnail[0]["attributes"]["src"] + ) ]; - continue; + continue 2; } - $this->image_arr[$image_arr[1][$i]] = - [ - [ - "url" => $original, - "width" => (int)$image_arr[7][$i], - "height" => (int)$image_arr[6][$i] - ], - [ - "url" => - $this->unshit_thumb( + // probe for description + if($description === null){ + + // probe 1 + if( + isset($snf["attributes"]["data-sncf"]) && + $snf["attributes"]["data-sncf"] == "1,2" + ){ + + $description = + $this->fuckhtml + ->getTextContent( + $snf + ); + continue; + } + + // probe 2 + $desc_probe = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "-webkit-line-clamp:2", + "div" + ); + + if(count($desc_probe) !== 0){ + + $description = + $this->fuckhtml + ->getTextContent( + $desc_probe[0] + ); + continue; + } + } + + // probe for links + $links = + $this->fuckhtml + ->getElementsByAttributeName( + "data-sb", + "a" + ); + + if(isset($links[0]["attributes"]["data-ved"])){ + + // found the page link + $link = + $this->fuckhtml + ->getTextContent( + $links[0]["attributes"]["href"] + ); + + continue; + } + + if(count($links) !== 0){ + + // get all sublinks + for($i=0; $i + $this->titledots( $this->fuckhtml - ->parseJsString( - $image_arr[2][$i] + ->getTextContent( + $links[$i] ) ), - "width" => (int)$image_arr[4][$i], - "height" => (int)$image_arr[3][$i] - ] - ]; - } - } - } - - - private function getdimg($dimg){ - - return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; - } - - - private function unshit_thumb($url, $get_bigger_res = false){ - // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj - // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA - - $parts = parse_url($url); - - if( - isset($parts["host"]) && - preg_match( - '/(?:encrypted-)?tbn.*\.gstatic\.com/', - $parts["host"] - ) - ){ - - parse_str($parts["query"], $params); - - if(isset($params["q"])){ + "description" => null, + "date" => null, + "url" => + $this->fuckhtml + ->getTextContent( + $links[$i]["attributes"]["href"] + ) + ]; + } + + continue; + } - if($get_bigger_res){ + // get tabloid-able data + $tabloid = + $this->fuckhtml + ->getElementsByAttributeValue( + "style", + "margin-top:0px", + "div" + ); + + if(count($tabloid) === 0){ - // this method doesnt always work, but does work for wiki thumbnails - return - "https://" . $parts["host"] . "/images?q=tbn:" . - $this->base64url_encode( - substr( - $this->base64url_decode( - explode( - ":", - $params["q"])[1] - ), - 0, - 29 - ) + // try getting instead + $tabloid = + $this->fuckhtml + ->getElementsByTagName( + "cite" ); - }else{ + } + + if(count($tabloid) !== 0){ - return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + // found table + $tabloid = + explode("·", $tabloid[0]["innerHTML"]); + + foreach($tabloid as $tbl){ + + $preg = + $this->fuckhtml + ->getTextContent( + $tbl + ); + + //$table[random_int(0,1000)] = $preg; + + if( + // match price + preg_match( + '/(\p{Sc}[^\p{Sc}]+)/', + $preg, + $match + ) + ){ + + $table["Price"] = trim($match[1]); + } + + if( + // match in stock/delivery + preg_match( + '/(stock|delivery|returns)/i', + $preg, + $match + ) + ){ + + $table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0"); + } + } + continue; } } - } - - return $url; - } - - - private function parsestyles(){ - - $styles = []; - - $style_div = - $this->fuckhtml - ->getElementsByTagName( - "style" - ); - - $raw_styles = ""; - - foreach($style_div as $style){ - - $raw_styles .= $style["innerHTML"]; - } - - // filter out media/keyframe queries - $raw_styles = - preg_replace( - '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', - "", - $raw_styles - ); - - // get styles - preg_match_all( - '/(.+?){([\S\s]*?)}/', - $raw_styles, - $matches - ); - - for($i=0; $i $value){ + if($date !== false){ - $styles[$name][$key] = $value; + $description = $description_split[1]; + }else{ + + $date = null; } } - } - - foreach($styles as $key => $values){ - $styles[$key]["_c"] = count($values); + $out["web"][] = [ + "title" => $this->titledots($title), + "description" => $this->titledots($description), + "url" => $link, + "date" => $date, + "type" => "web", + "thumb" => $thumb, + "sublink" => $sublinks, + "table" => $table + ]; } - $this->styles = $styles; - - // get CSS colors - $this->css_colors = []; - - if(isset($this->styles[":root"])){ + // get next page + if(count($out["web"]) > 5){ - foreach($this->styles[":root"] as $key => $value){ - - $this->css_colors[$value] = strtolower($key); - } - } - } - - - - private function getstyle($styles){ - - $styles["_c"] = count($styles); - - foreach($this->styles as $style_key => $style_values){ + $params["start"] += 10; - if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ - - $style_key = - explode(" ", $style_key); - - $style_key = $style_key[count($style_key) - 1]; - - return - ltrim( - str_replace( - [".", "#"], - " ", - $style_key - ) - ); - } + $out["npt"] = + $this->backend->store( + json_encode($params), + "web", + $proxy + ); } - return false; + return $out; } - - private function getcolorvar($color){ + public function video($get){ - if(isset($this->css_colors[$color])){ + if($get["npt"]){ - return $this->css_colors[$color]; - } - - return null; - } - - - - public function web($get){ - - if($get["npt"]){ - - [$get, $proxy] = $this->backend->get($get["npt"], "web"); - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com" . $get, - [], - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } + [$params, $proxy] = $this->backend->get($get["npt"], "video"); + $params = json_decode($params, true); }else{ $search = $get["s"]; $country = $get["country"]; $nsfw = $get["nsfw"]; - $lang = $get["lang"]; $older = $get["older"]; $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; + $duration = $get["duration"]; + $quality = $get["quality"]; + $captions = $get["captions"]; $proxy = $this->backend->get_ip(); - $offset = 0; - $params = [ "q" => $search, + "udm" => "7", "hl" => "en", "num" => 20 ]; @@ -920,13 +1058,6 @@ class google{ // nsfw $params["safe"] = $nsfw == "yes" ? "off" : "active"; - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs $tbs = []; // get date @@ -943,1375 +1074,357 @@ class google{ $tbs["cd_max"] = $older; } - // spellcheck filter - if($spellcheck == "no"){ + // duration + if($duration != "any"){ - $params["nfpr"] = "1"; + $tbs[] = "dur:" . $duration; } - if(count($tbs) !== 0){ - - $params["tbs"] = ""; + // quality + if($quality != "any"){ - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } + $tbs[] = "hq:" . $quality; + } + + // captions + if($captions != "any"){ - $params["tbs"] = rtrim($params["tbs"], ","); + $tbs[] = "cc:" . $captions; } - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params, - true - ); - }catch(Exception $error){ + // append tbs + if(count($tbs) !== 0){ - throw new Exception("Failed to get HTML"); + $params["tbs"] = + implode(",", $tbs); } + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params + ); + }catch(Exception $error){ - //$html = file_get_contents("scraper/google.html"); + throw new Exception("Failed to get HTML"); } - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; + if(!isset($params["start"])){ + + $params["start"] = 0; + } + $params["start"] += 20; $this->fuckhtml->load($html); + + // + // Parse web video page + // $this->detect_sorry(); + // parse all