diff options
| author | lolcat <will@lolcat.ca> | 2026-03-18 00:01:22 -0400 |
|---|---|---|
| committer | lolcat <will@lolcat.ca> | 2026-03-18 00:01:22 -0400 |
| commit | 61548e8b84e948ef71e405a980941a9fc996ae28 (patch) | |
| tree | 9bfa394171127357efca7d981b73316f5d4e5fc2 /scraper | |
| parent | 2e5edda85b341074afd48a6a3c37ad9e2b249679 (diff) | |
fix google images
Diffstat (limited to 'scraper')
| -rw-r--r-- | scraper/google.php | 1259 |
1 files changed, 61 insertions, 1198 deletions
diff --git a/scraper/google.php b/scraper/google.php index 2f71e0e..73fd7a4 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -573,1148 +573,17 @@ class google{ public function web($get){ - // it broke again. lasted 3 months - // lets hope for another solid 3 month - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - if($get["npt"]){ - - [$get, $proxy] = $this->backend->get($get["npt"], "web"); - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com" . $get, - [], - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - }else{ - - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - $params = [ - "q" => $search, - "hl" => "en", - "udm" => 14 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ - - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params, - true - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$html = file_get_contents("scraper/google.html"); - } - - // init - $this->fuckhtml->load($html); - $this->detect_sorry(); - $this->parsestyles(); - - // get javascript images - $this->scrape_dimg($html); - $this->scrape_imagearr($html); - - // get next page - $npt = - $this->fuckhtml - ->getElementsByAttributeValue( - "aria-label", - "More search results", - "a" - ); - - if(count($npt) === 0){ - - // maybe we have the npt object from 2nd page, probe for that - $npt = - $this->fuckhtml - ->getElementsByAttributeValue( - "aria-label", - "Next page", - "a" - ); - } - - if(count($npt) !== 0){ - - $out["npt"] = - $this->backend->store( - $this->fuckhtml - ->getTextContent( - $npt[0]["attributes"]["href"] - ), - "web", - $proxy - ); - } - - // outer div is .MjjYud - // inner div always contain role="presentation" - - $outer = - $this->fuckhtml - ->getElementsByClassName( - "MjjYud", - "div" - ); - - // used later - $fancycontainer_class = - explode( - " ", - $this->getstyle([ - "padding-top" => "4px", - "padding-bottom" => "calc(12px*1)" - ]), - 2 - ); - - if(count($fancycontainer_class) === 2){ - - $fancycontainer_class = $fancycontainer_class[1]; - }else{ - - $fancycontainer_class = false; - } - - foreach($outer as $container){ - - $this->fuckhtml->load($container); - - // probe for search result - $title = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "link", - "div" - ); - - if(count($title) !== 0){ - - // we found a search result - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - - // get url - $sprobe = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $link = null; - - foreach($sprobe as $possible_link){ - - if( - isset($possible_link["attributes"]["href"]) && - preg_match( - '/^\/url\?q=/', - $possible_link["attributes"]["href"] - ) - ){ - - $link = - $this->fuckhtml - ->getTextContent( - $possible_link["attributes"]["href"] - ); - - break; - } - } - - if($link === null){ - - // should not happen - continue; - } - - // get description - // as usual, theres a thousand fucking possible divs for this one - - // probe for youtube-like description - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "align-items" => "flex-start", - "display" => "flex", - "justify-content" => "center", - "padding" => "7px 12px", - "padding-right" => "0", - "padding-top" => "0" - ]), - "div" - ); - - $ratio = "16:9"; - - if(count($description) === 0){ - - // fail. find the one with the image on the right handside - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:12px", - "div" - ); - - $ratio = "1:1"; - - if(count($description) === 0){ - - // fail. find the one that is used the most - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:3", - "div" - ); - - if(count($description) === 0){ - - // last fail. this one appears with divs that have prices - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "max-width:100vw;grid-area:nke7rc;padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:6px", - "div" - ); - } - } - } - - if(count($description) === 0){ - - // should not happen but whatever - $description = null; - }else{ - - $description = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - } - - // probe for date - $desc2 = explode("—", $description, 2); - - $time = null; - - if(count($desc2) === 2){ - - $time = strtotime($desc2[0]); - - if( - strlen($desc2[0]) < 16 && - $time !== false - ){ - - $description = ltrim($desc2[1]); - }else{ - - $time = null; - } - } - - $thumb = [ - "ratio" => null, - "url" => null, - ]; - - // get thumbnail - $images = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - foreach($images as $image){ - - if(isset($image["attributes"]["id"])){ - - $thumb = [ - "ratio" => $ratio, - "url" => $this->getdimg($image["attributes"]["id"]) - ]; - } - } - - // get sublinks - $sublinks = []; - - // probe for the fancy version - if($fancycontainer_class !== false){ - $fancycontainer = - $this->fuckhtml - ->getElementsByClassName( - $fancycontainer_class, - "div" - ); - } - - if( - $fancycontainer_class !== false && - count($fancycontainer) !== 0 - ){ - - $this->fuckhtml->load($fancycontainer[0]); - - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - foreach($as as $a){ - - $sublinks[] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $a - ), - "description" => null, - "date" => null, - "url" => - $this->unshiturl( - $a["attributes"]["href"] - ) - ]; - } - } - - $out["web"][] = [ - "title" => $title, - "description" => $description, - "url" => $this->unshiturl($link), - "date" => $time, - "type" => "web", - "thumb" => $thumb, - "sublink" => $sublinks, - "table" => [] - ]; - continue; - } - - // probe for containers with a title header - $title_header = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "display" => "flex", - "flex-wrap" => "wrap", - "position" => "relative", - "padding" => "16px" - ]) - ); - - if(count($title_header) !== 0){ - - $title_header = - strtolower( - $this->fuckhtml - ->getTextContent( - $title_header[0] - ) - ); - - switch($title_header){ - - case "people also search for": - // get all related searches - $relateds = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "display" => "flex", - "height" => "100%", - "flex-direction" => "column", - "max-width" => "100%" - ]) - ); - - foreach($relateds as $r){ - - $out["related"][] = - $this->fuckhtml - ->getTextContent( - $r - ); - } - break; - } - - continue; - } - } - - $out["related"] = array_values(array_unique($out["related"])); - - return $out; + throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now."); } public function video($get){ - - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "video"); - $params = json_decode($params, true); - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $older = $get["older"]; - $newer = $get["newer"]; - $duration = $get["duration"]; - $quality = $get["quality"]; - $captions = $get["captions"]; - $proxy = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "udm" => "7", - "hl" => "en", - "num" => 20 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // duration - if($duration != "any"){ - - $tbs[] = "dur:" . $duration; - } - - // quality - if($quality != "any"){ - - $tbs[] = "hq:" . $quality; - } - - // captions - if($captions != "any"){ - - $tbs[] = "cc:" . $captions; - } - - // append tbs - if(count($tbs) !== 0){ - - $params["tbs"] = - implode(",", $tbs); - } - } - - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - if(!isset($params["start"])){ - - $params["start"] = 0; - } - $params["start"] += 20; - - $this->fuckhtml->load($html); - - // - // Parse web video page - // - $this->detect_sorry(); - - // parse all <style> tags - $this->parsestyles(); - - // get javascript images - $this->scrape_dimg($html); - - $this->scrape_imagearr($html); - - $out = [ - "status" => "ok", - "npt" => - $this->backend->store( - json_encode($params), - "videos", - $proxy - ), - "video" => [], - "author" => [], - "livestream" => [], - "playlist" => [], - "reel" => [] - ]; - - $search_div = - $this->fuckhtml - ->getElementById( - "center_col" - ); - - if($search_div === false){ - - throw new Exception("Failed to grep search div"); - } - - $this->fuckhtml->load($search_div); - - $results = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "margin" => "0px 0px 30px" - ]), - "div" - ); - - foreach($results as $result){ - - $this->fuckhtml->load($result); - - $url = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($url) === 0){ - - // no url, weird, continue - continue; - } - - $title = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($title) === 0){ - - // no title, weird, continue - continue; - } - - // get description - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "-webkit-box-orient" => "vertical", - "display" => "-webkit-box", - "-webkit-line-clamp" => "2", - "overflow" => "hidden", - "word-break" => "break-word" - ]), - "div" - ); - - if(count($description) === 0){ - - $description = null; - }else{ - - $description = - html_entity_decode( - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ) - ); - } - - // get author + date posted - $metadiv = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "margin-top" => "12px" - ]), - "div" - ); - - $author = null; - $date = null; - - if(count($metadiv) !== 0){ - - $metadiv = - explode( - "·", - $this->fuckhtml - ->getTextContent( - $metadiv[0] - ) - ); - - if(count($metadiv) === 3){ - - $author = trim($metadiv[1]); - $date = strtotime(trim($metadiv[2])); - }elseif(count($metadiv) === 2){ - - $author = trim($metadiv[0]); - $date = strtotime(trim($metadiv[1])); - } - } - - $thumb = [ - "url" => null, - "ratio" => null - ]; - - $image = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - $duration = null; - - if( - count($image) !== 0 && - isset($image[0]["attributes"]["id"]) - ){ - - $thumb = [ - "url" => $this->getdimg($image[0]["attributes"]["id"]), - "ratio" => "16:9" - ]; - - // get duration - $duration = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle([ - "background-color" => "rgba(0,0,0,0.6)", - "color" => "#fff", - "fill" => "#fff" - ]) - ); - - if(count($duration) !== 0){ - - $duration = - $this->hms2int( - $this->fuckhtml - ->getTextContent( - $duration[0] - )); - }else{ - - $duration = null; - } - } - - $out["video"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ), - "description" => $description, - "author" => [ - "name" => $author, - "url" => null, - "avatar" => null - ], - "date" => $date, - "duration" => $duration, - "views" => null, - "thumb" => $thumb, - "url" => - $this->fuckhtml - ->getTextContent( - $url[0]["attributes"]["href"] - ) - ]; - } - - return $out; + throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now."); } public function news($get){ - - throw new Exception("Broke for now, fuck off lol"); - - if($get["npt"]){ - - [$req, $proxy] = $this->backend->get($get["npt"], "news"); - /*parse_str( - parse_url($req, PHP_URL_QUERY), - $search - );*/ - - try{ - - $html = - $this->get( - $proxy, - "https://www.google.com" . $req, - [] - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $older = $get["older"]; - $newer = $get["newer"]; - $sort = $get["sort"]; - $proxy = $this->backend->get_ip(); - - $params = [ - "q" => $search, - "tbm" => "nws", - "hl" => "en", - "num" => "20" - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // relevance - if($sort == "date"){ - - $tbs["sbd"] = "1"; - } - - // append tbs - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - //$html = file_get_contents("scraper/google-news.html"); - - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - } - - $out = [ - "status" => "ok", - "npt" => null, - "news" => [] - ]; - - $this->fuckhtml->load($html); - - $this->detect_sorry(); - - // get images - $this->scrape_dimg($html); - - // parse styles - $this->parsestyles(); - - $center_col = - $this->fuckhtml - ->getElementById( - "center_col", - "div" - ); - - if($center_col === null){ - - throw new Exception("Could not grep result div"); - } - - $this->fuckhtml->load($center_col); - - // get next page - $npt = - $this->fuckhtml - ->getElementById( - "pnnext", - "a" - ); - - if($npt !== false){ - - $out["npt"] = - $this->backend->store( - $this->fuckhtml - ->getTextContent( - $npt["attributes"] - ["href"] - ), - "news", - $proxy - ); - } - - $as = - $this->fuckhtml - ->getElementsByAttributeName( - "jsname", - "a" - ); - - foreach($as as $a){ - - $this->fuckhtml->load($a); - - // get title - $title = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "heading", - "div" - ); - - if(count($title) === 0){ - - continue; - } - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - - // get thumbnail - $image = - $this->fuckhtml - ->getElementsByAttributeName( - "id", - "img" - ); - - // check for padded title node, if found, we're inside a carousel - $probe = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "padding" => "16px 16px 40px 16px" - ] - ), - "div" - ); - - if(count($probe) !== 0){ - - $probe = true; - }else{ - - $probe = false; - } - - if( - count($image) !== 0 && - !isset($image[0]["attributes"]["width"]) - ){ - - $thumb = [ - "url" => - $this->getdimg( - $image[0]["attributes"]["id"] - ), - "ratio" => $probe === true ? "16:9" : "1:1" - ]; - }else{ - - $thumb = [ - "url" => null, - "ratio" => null - ]; - } - - $description = null; - - if($probe === false){ - - $desc_divs = - $this->fuckhtml - ->getElementsByAttributeName( - "style", - "div" - ); - - foreach($desc_divs as $desc){ - - if( - strpos( - $desc["attributes"]["style"], - "margin-top:" - ) !== false - ){ - - $description = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $desc - ) - ); - break; - } - } - } - - // get author - $author = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "overflow" => "hidden", - "text-align" => "left", - "text-overflow" => "ellipsis", - "white-space" => "nowrap", - "margin-bottom" => "8px" - ] - ), - "div" - ); - - if(count($author) !== 0){ - - $author = - $this->fuckhtml - ->getTextContent( - $author[0] - ); - }else{ - - $author = null; - } - - // get date - $date = null; - - $date_div = - $this->fuckhtml - ->getElementsByAttributeName( - "style", - "div" - ); - - foreach($date_div as $d){ - - $this->fuckhtml->load($d); - - $span = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - if( - strpos( - $d["attributes"]["style"], - "bottom:" - ) !== false - ){ - - $date = - strtotime( - $this->fuckhtml - ->getTextContent( - $span[count($span) - 1] - ) - ); - break; - } - } - - $out["news"][] = [ - "title" => $title, - "author" => $author, - "description" => $description, - "date" => $date, - "thumb" => $thumb, - "url" => - $this->unshiturl( - $a["attributes"] - ["href"] - ) - ]; - } - - return $out; + throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now."); } @@ -1730,6 +599,11 @@ class google{ ); $params = json_decode($params, true); + + $page = $params["page"] + 1; + $params = $params["params"]; + $params["async"] = "_fmt:json,p:1,ijn:{$page}"; + }else{ $search = $get["s"]; @@ -1749,9 +623,13 @@ class google{ $format = $get["format"]; $rights = $get["rights"]; + $page = 0; + $params = [ "q" => $search, - "udm" => "2" // get images + "tbm" => "isch", + "asearch" => "isch", + "async" => "_fmt:json,p:1,ijn:{$page}", // ijn:0 = page 1 ]; // country (image search uses cr instead of gl) @@ -1834,13 +712,9 @@ class google{ $params["tbs"] = rtrim($params["tbs"], ","); } } - /* - $handle = fopen("scraper/page.html", "r"); - $html = fread($handle, filesize("scraper/page.html")); - fclose($handle);*/ try{ - $html = + $json = $this->get( $proxy, "https://www.google.com/search", @@ -1851,12 +725,28 @@ class google{ throw new Exception("Failed to get search page"); } - $this->fuckhtml->load($html); + unset($params["async"]); + + //$json = file_get_contents("scraper/google.json"); + // detect captcha + $this->fuckhtml->load($json); $this->detect_sorry(); - // get javascript images - $this->scrape_imagearr($html); + // remove xssi + $json = + preg_replace( + '/^[^{]*/', + "", + $json + ); + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } $out = [ "status" => "ok", @@ -1864,69 +754,42 @@ class google{ "image" => [] ]; - $images = - $this->fuckhtml - ->getElementsByClassName( - "ivg-i", - "div" - ); - - foreach($images as $div){ - - $this->fuckhtml->load($div); - - $image = - $this->fuckhtml - ->getElementsByTagName("img")[0]; + if(!isset($json["ischj"]["metadata"])){ - // make sure we dont attempt to show an image we dont have data for - if( - isset($div["attributes"]["data-docid"]) && - isset($this->image_arr[$div["attributes"]["data-docid"]]) - ){ - - $source = - $this->image_arr[ - $div["attributes"]["data-docid"] - ]; - }else{ - - continue; - } + throw new Exception("Google did not return an image array"); + } + + foreach($json["ischj"]["metadata"] as $image){ $out["image"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $image["attributes"]["alt"] - ) - ), - "source" => $source, - "url" => - $this->fuckhtml - ->getTextContent( - $div["attributes"]["data-lpage"] - ) + "title" => $this->titledots($image["result"]["page_title"]), + "source" => [ + [ + "url" => $image["original_image"]["url"], + "width" => (int)$image["original_image"]["width"], + "height" => (int)$image["original_image"]["height"] + ], + [ + "url" => $image["thumbnail"]["url"], + "width" => (int)$image["thumbnail"]["width"], + "height" => (int)$image["thumbnail"]["height"] + ] + ], + "url" => $image["result"]["referrer_url"] ]; } - // as usual, no way to check if there is a next page reliably - if(count($out["image"]) > 50){ - - if(!isset($params["start"])){ - - $params["start"] = 10; - }else{ - - $params["start"] += 10; - } + $page++; + + if(count($out["image"]) === 10){ $out["npt"] = - $this->backend - ->store( - json_encode($params), - "image", + $this->backend->store( + json_encode([ + "params" => $params, + "page" => $page + ]), + "images", $proxy ); } |
