diff options
| author | lolcat <will@lolcat.ca> | 2025-10-08 00:42:36 -0400 |
|---|---|---|
| committer | lolcat <will@lolcat.ca> | 2025-10-08 00:42:36 -0400 |
| commit | a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 (patch) | |
| tree | 1105658aaafdaec94205a3d1c032ba967fd5ac20 /scraper/google.php | |
| parent | 4b16fd58971321e6938046702210e2773573c621 (diff) | |
google quote on quote fix
Diffstat (limited to 'scraper/google.php')
| -rw-r--r-- | scraper/google.php | 461 |
1 files changed, 1 insertions, 460 deletions
diff --git a/scraper/google.php b/scraper/google.php index c83b084..049c844 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -561,466 +561,7 @@ class google{ public function web($get){ - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "web"); - - $params = json_decode($params, true); - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - /* - https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - - https://www.google.com/search?udm=14& - yv=3& - q=asmr& - biw=1920& - bih=947& - start=0& - sa=N& - asearch=arc& - cs=1& - async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - */ - - $params = [ - "udm" => 14, - "yv" => 3, - "q" => $search, - "biw" => 1920, - "bih" => 947, - "start" => 0, - "sa" => "N", - "asearch" => "arc", - "cs" => 1, - "async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc", - "hl" => "en", - "num" => 20 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ - - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$json = file_get_contents("scraper/google.js"); - } - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $this->fuckhtml->load($json); - $this->detect_sorry(); - - // get next page - /* - $npt = - $this->fuckhtml - ->getElementsByAttributeName( - "data-state-token", - "div" - ); - - if(count($npt) !== 0){ - - $params["sstk"] = - $this->fuckhtml - ->getTextContent( - $npt[0]["attributes"]["data-state-token"] - ); - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - }*/ - - // get invididual results - $results = - $this->fuckhtml - ->getElementsByAttributeName( - "data-hveid", - "div" - ); - - foreach($results as $result){ - - $this->fuckhtml->load($result); - - //echo $result["innerHTML"]; - - $snfs = - $this->fuckhtml - ->getElementsByAttributeName( - "data-snf", - "div" - ); - - $title = null; - $description = null; - $link = null; - $sublinks = []; - $date = null; - $thumb = [ - "ratio" => null, - "url" => null - ]; - $table = []; - - // probe for title - $title_node = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($title_node) !== 0){ - - // found a title node - $title = - $this->fuckhtml - ->getTextContent( - $title_node[0] - ); - } - - if($title === null){ - - // should not happen - continue; - } - - foreach($snfs as $snf){ - - $this->fuckhtml->load($snf); - - // probe for thumbnail - $thumbnail = - $this->fuckhtml - ->getElementsByAttributeName( - "alt", - "img" - ); - - foreach($thumbnail as $t){ - - if( - isset($t["attributes"]["style"]) && - preg_match( - '/height ?: ?([0-9]+)px/', - $t["attributes"]["style"], - $match - ) && - (int)$match[1] < 40 - ){ - - // found a favicon, ignore - continue; - } - - $thumb = [ - "ratio" => "1:1", - "url" => - $this->fuckhtml - ->getTextContent( - $thumbnail[0]["attributes"]["src"] - ) - ]; - - continue 2; - } - - // probe for description - if($description === null){ - - // probe 1 - if( - isset($snf["attributes"]["data-sncf"]) && - $snf["attributes"]["data-sncf"] == "1,2" - ){ - - $description = - $this->fuckhtml - ->getTextContent( - $snf - ); - continue; - } - - // probe 2 - $desc_probe = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:2", - "div" - ); - - if(count($desc_probe) !== 0){ - - $description = - $this->fuckhtml - ->getTextContent( - $desc_probe[0] - ); - continue; - } - } - - // probe for links - $links = - $this->fuckhtml - ->getElementsByAttributeName( - "data-sb", - "a" - ); - - if(isset($links[0]["attributes"]["data-ved"])){ - - // found the page link - $link = - $this->fuckhtml - ->getTextContent( - $links[0]["attributes"]["href"] - ); - - continue; - } - - if(count($links) !== 0){ - - // get all sublinks - for($i=0; $i<count($links); $i++){ - - $sublinks[] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $links[$i] - ) - ), - "description" => null, - "date" => null, - "url" => - $this->fuckhtml - ->getTextContent( - $links[$i]["attributes"]["href"] - ) - ]; - } - - continue; - } - - // get tabloid-able data - $tabloid = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "margin-top:0px", - "div" - ); - - if(count($tabloid) === 0){ - - // try getting <cite> instead - $tabloid = - $this->fuckhtml - ->getElementsByTagName( - "cite" - ); - } - - if(count($tabloid) !== 0){ - - // found table - $tabloid = - explode("·", $tabloid[0]["innerHTML"]); - - foreach($tabloid as $tbl){ - - $preg = - $this->fuckhtml - ->getTextContent( - $tbl - ); - - //$table[random_int(0,1000)] = $preg; - - if( - // match price - preg_match( - '/(\p{Sc}[^\p{Sc}]+)/', - $preg, - $match - ) - ){ - - $table["Price"] = trim($match[1]); - } - - if( - // match in stock/delivery - preg_match( - '/(stock|delivery|returns)/i', - $preg, - $match - ) - ){ - - $table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0"); - } - } - continue; - } - } - - // extract date from description - $description_split = - explode( - "—", $description, 2 - ); - - if(count($description_split) === 1){ - - $description = $description_split[0]; - }elseif(strlen($description_split[0]) < 17){ - - $date = strtotime($description_split[0]); - - if($date !== false){ - - $description = $description_split[1]; - }else{ - - $date = null; - } - } - - $out["web"][] = [ - "title" => $this->titledots($title), - "description" => $this->titledots($description), - "url" => $link, - "date" => $date, - "type" => "web", - "thumb" => $thumb, - "sublink" => $sublinks, - "table" => $table - ]; - } - - // get next page - if(count($out["web"]) > 5){ - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - } - - return $out; + throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers."); } |
