fix google images

author: lolcat <will@lolcat.ca> 2026-03-18 00:01:22 -0400
committer: lolcat <will@lolcat.ca> 2026-03-18 00:01:22 -0400
commit: 61548e8b84e948ef71e405a980941a9fc996ae28 (patch)
tree: 9bfa394171127357efca7d981b73316f5d4e5fc2 /scraper
parent: 2e5edda85b341074afd48a6a3c37ad9e2b249679 (diff)
1 files changed, 61 insertions, 1198 deletions
diff --git a/scraper/google.php b/scraper/google.php
index 2f71e0e..73fd7a4 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -573,1148 +573,17 @@ class google{
 	
 	public function web($get){
 		
-		// it broke again. lasted 3 months
-		// lets hope for another solid 3 month
-		
-		$out = [
-			"status" => "ok",
-			"spelling" => [
-				"type" => "no_correction",
-				"using" => null,
-				"correction" => null
-			],
-			"npt" => null,
-			"answer" => [],
-			"web" => [],
-			"image" => [],
-			"video" => [],
-			"news" => [],
-			"related" => []
-		];
-		
-		if($get["npt"]){
-			
-			[$get, $proxy] = $this->backend->get($get["npt"], "web");
-			
-			try{
-				$html =
-					$this->get(
-						$proxy,
-						"https://www.google.com" . $get,
-						[],
-						true
-					);
-			}catch(Exception $error){
-				
-				throw new Exception("Failed to get HTML");
-			}
-		}else{
-			
-			$search = $get["s"];
-			$country = $get["country"];
-			$nsfw = $get["nsfw"];
-			$lang = $get["lang"];
-			$older = $get["older"];
-			$newer = $get["newer"];
-			$spellcheck = $get["spellcheck"];
-			$proxy = $this->backend->get_ip();
-			
-			$offset = 0;
-			
-			$params = [
-				"q" => $search,
-				"hl" => "en",
-				"udm" => 14
-			];
-			
-			// country
-			if($country != "any"){
-				
-				$params["gl"] = $country;
-			}
-			
-			// nsfw
-			$params["safe"] = $nsfw == "yes" ? "off" : "active";
-			
-			// language
-			if($lang != "any"){
-				
-				$params["lr"] = "lang_" . $lang;
-			}
-			
-			// generate tbs
-			$tbs = [];
-			
-			// get date
-			$older = $older === false ? null : date("m/d/Y", $older);
-			$newer = $newer === false ? null : date("m/d/Y", $newer);
-			
-			if(
-				$older !== null ||
-				$newer !== null
-			){
-				
-				$tbs["cdr"] = "1";
-				$tbs["cd_min"] = $newer;
-				$tbs["cd_max"] = $older;
-			}
-			
-			// spellcheck filter
-			if($spellcheck == "no"){
-				
-				$params["nfpr"] = "1";
-			}
-			
-			if(count($tbs) !== 0){
-				
-				$params["tbs"] = "";
-				
-				foreach($tbs as $key => $value){
-					
-					$params["tbs"] .= $key . ":" . $value . ",";
-				}
-				
-				$params["tbs"] = rtrim($params["tbs"], ",");
-			}
-			
-			try{
-				$html =
-					$this->get(
-						$proxy,
-						"https://www.google.com/search",
-						$params,
-						true
-					);
-			}catch(Exception $error){
-				
-				throw new Exception("Failed to get HTML");
-			}
-			
-			//$html = file_get_contents("scraper/google.html");
-		}
-		
-		// init
-		$this->fuckhtml->load($html);
-		$this->detect_sorry();
-		$this->parsestyles();
-		
-		// get javascript images
-		$this->scrape_dimg($html);
-		$this->scrape_imagearr($html);
-		
-		// get next page
-		$npt =
-			$this->fuckhtml
-			->getElementsByAttributeValue(
-				"aria-label",
-				"More search results",
-				"a"
-			);
-		
-		if(count($npt) === 0){
-			
-			// maybe we have the npt object from 2nd page, probe for that
-			$npt =
-				$this->fuckhtml
-				->getElementsByAttributeValue(
-					"aria-label",
-					"Next page",
-					"a"
-				);
-		}
-		
-		if(count($npt) !== 0){
-			
-			$out["npt"] =
-				$this->backend->store(
-					$this->fuckhtml
-					->getTextContent(
-						$npt[0]["attributes"]["href"]
-					),
-					"web",
-					$proxy
-				);
-		}
-		
-		// outer div is .MjjYud
-		// inner div always contain role="presentation"
-		
-		$outer =
-			$this->fuckhtml
-			->getElementsByClassName(
-				"MjjYud",
-				"div"
-			);
-		
-		// used later
-		$fancycontainer_class =
-			explode(
-				" ",
-				$this->getstyle([
-					"padding-top" => "4px",
-					"padding-bottom" => "calc(12px*1)"
-				]),
-				2
-			);
-		
-		if(count($fancycontainer_class) === 2){
-			
-			$fancycontainer_class = $fancycontainer_class[1];
-		}else{
-			
-			$fancycontainer_class = false;
-		}
-		
-		foreach($outer as $container){
-			
-			$this->fuckhtml->load($container);
-			
-			// probe for search result
-			$title =
-				$this->fuckhtml
-				->getElementsByAttributeValue(
-					"role",
-					"link",
-					"div"
-				);
-			
-			if(count($title) !== 0){
-				
-				// we found a search result
-				
-				$title =
-					$this->titledots(
-						$this->fuckhtml
-						->getTextContent(
-							$title[0]
-						)
-					);
-				
-				// get url
-				$sprobe =
-					$this->fuckhtml
-					->getElementsByTagName(
-						"a"
-					);
-				
-				$link = null;
-				
-				foreach($sprobe as $possible_link){
-					
-					if(
-						isset($possible_link["attributes"]["href"]) &&
-						preg_match(
-							'/^\/url\?q=/',
-							$possible_link["attributes"]["href"]
-						)
-					){
-						
-						$link =
-							$this->fuckhtml
-							->getTextContent(
-								$possible_link["attributes"]["href"]
-							);
-						
-						break;
-					}
-				}
-				
-				if($link === null){
-					
-					// should not happen
-					continue;
-				}
-				
-				// get description
-				// as usual, theres a thousand fucking possible divs for this one
-				
-				// probe for youtube-like description
-				$description =
-					$this->fuckhtml
-					->getElementsByClassName( 
-						$this->getstyle([
-							"align-items" => "flex-start",
-							"display" => "flex",
-							"justify-content" => "center",
-							"padding" => "7px 12px",
-							"padding-right" => "0",
-							"padding-top" => "0"
-						]),
-						"div"
-					);
-				
-				$ratio = "16:9";
-				
-				if(count($description) === 0){
-					
-					// fail. find the one with the image on the right handside
-					$description =
-						$this->fuckhtml
-						->getElementsByAttributeValue(
-							"style",
-							"padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:12px",
-							"div"
-						);
-					
-					$ratio = "1:1";
-					
-					if(count($description) === 0){
-						
-						// fail. find the one that is used the most
-						$description =
-							$this->fuckhtml
-							->getElementsByAttributeValue(
-								"style",
-								"-webkit-line-clamp:3",
-								"div"
-							);
-						
-						if(count($description) === 0){
-							
-							// last fail. this one appears with divs that have prices
-							$description =
-								$this->fuckhtml
-								->getElementsByAttributeValue(
-									"style",
-									"max-width:100vw;grid-area:nke7rc;padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:6px",
-									"div"
-								);
-						}
-					}
-				}
-				
-				if(count($description) === 0){
-					
-					// should not happen but whatever
-					$description = null;
-				}else{
-					
-					$description =
-						$this->titledots(
-							$this->fuckhtml
-							->getTextContent(
-								$description[0]
-							)
-						);
-				}
-				
-				// probe for date
-				$desc2 = explode("—", $description, 2);
-				
-				$time = null;
-				
-				if(count($desc2) === 2){
-					
-					$time = strtotime($desc2[0]);
-					
-					if(
-						strlen($desc2[0]) < 16 &&
-						$time !== false
-					){
-						
-						$description = ltrim($desc2[1]);
-					}else{
-						
-						$time = null;
-					}
-				}
-				
-				$thumb = [
-					"ratio" => null,
-					"url" => null,
-				];
-				
-				// get thumbnail
-				$images =
-					$this->fuckhtml
-					->getElementsByTagName(
-						"img"
-					);
-				
-				foreach($images as $image){
-					
-					if(isset($image["attributes"]["id"])){
-						
-						$thumb = [
-							"ratio" => $ratio,
-							"url" => $this->getdimg($image["attributes"]["id"])
-						];
-					}
-				}
-				
-				// get sublinks
-				$sublinks = [];
-				
-				// probe for the fancy version
-				if($fancycontainer_class !== false){
-					$fancycontainer =
-						$this->fuckhtml
-						->getElementsByClassName(
-							$fancycontainer_class,
-							"div"
-						);
-				}
-				
-				if(
-					$fancycontainer_class !== false &&
-					count($fancycontainer) !== 0
-				){
-					
-					$this->fuckhtml->load($fancycontainer[0]);
-					
-					$as =
-						$this->fuckhtml
-						->getElementsByTagName(
-							"a"
-						);
-					
-					foreach($as as $a){
-						
-						$sublinks[] = [
-							"title" =>
-								$this->fuckhtml
-								->getTextContent(
-									$a
-								),
-							"description" => null,
-							"date" => null,
-							"url" =>
-								$this->unshiturl(
-									$a["attributes"]["href"]
-								)
-						];
-					}
-				}
-				
-				$out["web"][] = [
-					"title" => $title,
-					"description" => $description,
-					"url" => $this->unshiturl($link),
-					"date" => $time,
-					"type" => "web",
-					"thumb" => $thumb,
-					"sublink" => $sublinks,
-					"table" => []
-				];
-				continue;
-			}
-			
-			// probe for containers with a title header
-			$title_header =
-				$this->fuckhtml
-				->getElementsByClassName(
-					$this->getstyle([
-						"display" => "flex",
-						"flex-wrap" => "wrap",
-						"position" => "relative",
-						"padding" => "16px"
-					])
-				);
-			
-			if(count($title_header) !== 0){
-				
-				$title_header =
-					strtolower(
-						$this->fuckhtml
-						->getTextContent(
-							$title_header[0]
-						)
-					);
-				
-				switch($title_header){
-					
-					case "people also search for":
-						// get all related searches
-						$relateds =
-							$this->fuckhtml
-							->getElementsByClassName(
-								$this->getstyle([
-									"display" => "flex",
-									"height" => "100%",
-									"flex-direction" => "column",
-									"max-width" => "100%"
-								])
-							);
-						
-						foreach($relateds as $r){
-							
-							$out["related"][] =
-								$this->fuckhtml
-								->getTextContent(
-									$r
-								);
-						}
-						break;
-				}
-				
-				continue;
-			}
-		}
-		
-		$out["related"] = array_values(array_unique($out["related"]));
-		
-		return $out;
+		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
 	}
 	
 	
 	public function video($get){
-		
-		if($get["npt"]){
-			
-			[$params, $proxy] = $this->backend->get($get["npt"], "video");
-			$params = json_decode($params, true);
-			
-		}else{
-			$search = $get["s"];
-			$country = $get["country"];
-			$nsfw = $get["nsfw"];
-			$older = $get["older"];
-			$newer = $get["newer"];
-			$duration = $get["duration"];
-			$quality = $get["quality"];
-			$captions = $get["captions"];
-			$proxy = $this->backend->get_ip();
-			
-			$params = [
-				"q" => $search,
-				"udm" => "7",
-				"hl" => "en",
-				"num" => 20
-			];
-			
-			// country
-			if($country != "any"){
-				
-				$params["gl"] = $country;
-			}
-			
-			// nsfw
-			$params["safe"] = $nsfw == "yes" ? "off" : "active";
-			
-			$tbs = [];
-			
-			// get date
-			$older = $older === false ? null : date("m/d/Y", $older);
-			$newer = $newer === false ? null : date("m/d/Y", $newer);
-			
-			if(
-				$older !== null ||
-				$newer !== null
-			){
-				
-				$tbs["cdr"] = "1";
-				$tbs["cd_min"] = $newer;
-				$tbs["cd_max"] = $older;
-			}
-			
-			// duration
-			if($duration != "any"){
-				
-				$tbs[] = "dur:" . $duration;
-			}
-			
-			// quality
-			if($quality != "any"){
-				
-				$tbs[] = "hq:" . $quality;
-			}
-			
-			// captions
-			if($captions != "any"){
-				
-				$tbs[] = "cc:" . $captions;
-			}
-			
-			// append tbs
-			if(count($tbs) !== 0){
-				
-				$params["tbs"] =
-					implode(",", $tbs);
-			}
-		}
-		
-		try{
-			$html =
-				$this->get(
-					$proxy,
-					"https://www.google.com/search",
-					$params
-				);
-		}catch(Exception $error){
-			
-			throw new Exception("Failed to get HTML");
-		}
-		
-		if(!isset($params["start"])){
-			
-			$params["start"] = 0;
-		}
-		$params["start"] += 20;
-		
-		$this->fuckhtml->load($html);
-		
-		//
-		// Parse web video page
-		//
-		$this->detect_sorry();
-		
-		// parse all <style> tags
-		$this->parsestyles();
-		
-		// get javascript images
-		$this->scrape_dimg($html);
-		
-		$this->scrape_imagearr($html);
-		
-		$out = [
-			"status" => "ok",
-			"npt" =>
-				$this->backend->store(
-					json_encode($params),
-					"videos",
-					$proxy
-				),
-			"video" => [],
-			"author" => [],
-			"livestream" => [],
-			"playlist" => [],
-			"reel" => []
-		];
-		
-		$search_div =
-			$this->fuckhtml
-			->getElementById(
-				"center_col"
-			);
-		
-		if($search_div === false){
-			
-			throw new Exception("Failed to grep search div");
-		}
-		
-		$this->fuckhtml->load($search_div);
-		
-		$results =
-			$this->fuckhtml
-			->getElementsByClassName(
-				$this->getstyle([
-					"margin" => "0px 0px 30px"
-				]),
-				"div"
-			);
-		
-		foreach($results as $result){
-			
-			$this->fuckhtml->load($result);
-			
-			$url =
-				$this->fuckhtml
-				->getElementsByTagName(
-					"a"
-				);
-			
-			if(count($url) === 0){
-				
-				// no url, weird, continue
-				continue;
-			}
-			
-			$title =
-				$this->fuckhtml
-				->getElementsByTagName(
-					"h3"
-				);
-			
-			if(count($title) === 0){
-				
-				// no title, weird, continue
-				continue;
-			}
-			
-			// get description
-			$description =
-				$this->fuckhtml
-				->getElementsByClassName(
-					$this->getstyle([
-						"-webkit-box-orient" => "vertical",
-						"display" => "-webkit-box",
-						"-webkit-line-clamp" => "2",
-						"overflow" => "hidden",
-						"word-break" => "break-word"
-					]),
-					"div"
-				);
-			
-			if(count($description) === 0){
-				
-				$description = null;
-			}else{
-				
-				$description =
-					html_entity_decode(
-						$this->titledots(
-							$this->fuckhtml
-							->getTextContent(
-								$description[0]
-							)
-						)
-					);
-			}
-			
-			// get author + date posted
-			$metadiv =
-				$this->fuckhtml
-				->getElementsByClassName(
-					$this->getstyle([
-						"margin-top" => "12px"
-					]),
-					"div"
-				);
-			
-			$author = null;
-			$date = null;
-			
-			if(count($metadiv) !== 0){
-				
-				$metadiv =
-					explode(
-						"·",
-						$this->fuckhtml
-						->getTextContent(
-							$metadiv[0]
-						)
-					);
-				
-				if(count($metadiv) === 3){
-					
-					$author = trim($metadiv[1]);
-					$date = strtotime(trim($metadiv[2]));
-				}elseif(count($metadiv) === 2){
-					
-					$author = trim($metadiv[0]);
-					$date = strtotime(trim($metadiv[1]));
-				}
-			}
-			
-			$thumb = [
-				"url" => null,
-				"ratio" => null
-			];
-			
-			$image =
-				$this->fuckhtml
-				->getElementsByTagName(
-					"img"
-				);
-			
-			$duration = null;
-			
-			if(
-				count($image) !== 0 &&
-				isset($image[0]["attributes"]["id"])
-			){
-				
-				$thumb = [
-					"url" => $this->getdimg($image[0]["attributes"]["id"]),
-					"ratio" => "16:9"
-				];
-				
-				// get duration
-				$duration =
-					$this->fuckhtml
-					->getElementsByClassName(
-						$this->getstyle([
-							"background-color" => "rgba(0,0,0,0.6)",
-							"color" => "#fff",
-							"fill" => "#fff"
-						])
-					);
-				
-				if(count($duration) !== 0){
-					
-					$duration =
-						$this->hms2int(
-							$this->fuckhtml
-							->getTextContent(
-								$duration[0]
-							));
-				}else{
-					
-					$duration = null;
-				}
-			}
-			
-			$out["video"][] = [
-				"title" =>
-					$this->titledots(
-						$this->fuckhtml
-						->getTextContent(
-							$title[0]
-						)
-					),
-				"description" => $description,
-				"author" => [
-					"name" => $author,
-					"url" => null,
-					"avatar" => null
-				],
-				"date" => $date,
-				"duration" => $duration,
-				"views" => null,
-				"thumb" => $thumb,
-				"url" =>
-					$this->fuckhtml
-					->getTextContent(
-						$url[0]["attributes"]["href"]
-					)
-			];
-		}
-		
-		return $out;
+		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
 	}
 	
 	
 	public function news($get){
-		
-		throw new Exception("Broke for now, fuck off lol");
-		
-		if($get["npt"]){
-			
-			[$req, $proxy] = $this->backend->get($get["npt"], "news");
-			/*parse_str(
-				parse_url($req, PHP_URL_QUERY),
-				$search
-			);*/
-			
-			try{
-				
-				$html =
-					$this->get(
-						$proxy,
-						"https://www.google.com" . $req,
-						[]
-					);
-			}catch(Exception $error){
-				
-				throw new Exception("Failed to get HTML");
-			}
-			
-		}else{
-			$search = $get["s"];
-			$country = $get["country"];
-			$nsfw = $get["nsfw"];
-			$older = $get["older"];
-			$newer = $get["newer"];
-			$sort = $get["sort"];
-			$proxy = $this->backend->get_ip();
-			
-			$params = [
-				"q" => $search,
-				"tbm" => "nws",
-				"hl" => "en",
-				"num" => "20"
-			];
-			
-			// country
-			if($country != "any"){
-				
-				$params["gl"] = $country;
-			}
-			
-			// nsfw
-			$params["safe"] = $nsfw == "yes" ? "off" : "active";
-			
-			$tbs = [];
-			
-			// get date
-			$older = $older === false ? null : date("m/d/Y", $older);
-			$newer = $newer === false ? null : date("m/d/Y", $newer);
-			
-			if(
-				$older !== null ||
-				$newer !== null
-			){
-				
-				$tbs["cdr"] = "1";
-				$tbs["cd_min"] = $newer;
-				$tbs["cd_max"] = $older;
-			}
-			
-			// relevance
-			if($sort == "date"){
-				
-				$tbs["sbd"] = "1";
-			}
-					
-			// append tbs
-			if(count($tbs) !== 0){
-				
-				$params["tbs"] = "";
-				
-				foreach($tbs as $key => $value){
-					
-					$params["tbs"] .= $key . ":" . $value . ",";
-				}
-				
-				$params["tbs"] = rtrim($params["tbs"], ",");
-			}
-			
-			//$html = file_get_contents("scraper/google-news.html");
-			
-			$html =
-				$this->get(
-					$proxy,
-					"https://www.google.com/search",
-					$params
-				);
-		}
-		
-		$out = [
-			"status" => "ok",
-			"npt" => null,
-			"news" => []
-		];
-		
-		$this->fuckhtml->load($html);
-		
-		$this->detect_sorry();
-		
-		// get images
-		$this->scrape_dimg($html);
-		
-		// parse styles
-		$this->parsestyles();
-		
-		$center_col =
-			$this->fuckhtml
-			->getElementById(
-				"center_col",
-				"div"
-			);
-		
-		if($center_col === null){
-			
-			throw new Exception("Could not grep result div");
-		}
-		
-		$this->fuckhtml->load($center_col);
-		
-		// get next page
-		$npt =
-			$this->fuckhtml
-			->getElementById(
-				"pnnext",
-				"a"
-			);
-		
-		if($npt !== false){
-			
-			$out["npt"] =
-				$this->backend->store(
-					$this->fuckhtml
-					->getTextContent(
-						$npt["attributes"]
-						["href"]
-					),
-					"news",
-					$proxy
-				);
-		}
-		
-		$as =
-			$this->fuckhtml
-			->getElementsByAttributeName(
-				"jsname",
-				"a"
-			);
-		
-		foreach($as as $a){
-			
-			$this->fuckhtml->load($a);
-			
-			// get title
-			$title =
-				$this->fuckhtml
-				->getElementsByAttributeValue(
-					"role",
-					"heading",
-					"div"
-				);
-			
-			if(count($title) === 0){
-				
-				continue;
-			}
-			
-			$title =
-				$this->titledots(
-					$this->fuckhtml
-					->getTextContent(
-						$title[0]
-					)
-				);
-			
-			// get thumbnail
-			$image =
-				$this->fuckhtml
-				->getElementsByAttributeName(
-					"id",
-					"img"
-				);
-			
-			// check for padded title node, if found, we're inside a carousel
-			$probe =
-				$this->fuckhtml
-				->getElementsByClassName(
-					$this->getstyle(
-						[
-							"padding" => "16px 16px 40px 16px"
-						]
-					),
-					"div"
-				);
-			
-			if(count($probe) !== 0){
-				
-				$probe = true;
-			}else{
-				
-				$probe = false;
-			}
-			
-			if(
-				count($image) !== 0 &&
-				!isset($image[0]["attributes"]["width"])
-			){
-				
-				$thumb = [
-					"url" =>
-						$this->getdimg(
-							$image[0]["attributes"]["id"]
-						),
-					"ratio" => $probe === true ? "16:9" : "1:1"
-				];
-			}else{
-				
-				$thumb = [
-					"url" => null,
-					"ratio" => null
-				];
-			}
-			
-			$description = null;
-			
-			if($probe === false){
-				
-				$desc_divs =
-					$this->fuckhtml
-					->getElementsByAttributeName(
-						"style",
-						"div"
-					);
-				
-				foreach($desc_divs as $desc){
-					
-					if(
-						strpos(
-							$desc["attributes"]["style"],
-							"margin-top:"
-						) !== false
-					){
-						
-						$description =
-							$this->titledots(
-								$this->fuckhtml
-								->getTextContent(
-									$desc
-								)
-							);
-						break;
-					}
-				}
-			}
-			
-			// get author
-			$author =
-				$this->fuckhtml
-				->getElementsByClassName(
-					$this->getstyle(
-						[
-							"overflow" => "hidden",
-							"text-align" => "left",
-							"text-overflow" => "ellipsis",
-							"white-space" => "nowrap",
-							"margin-bottom" => "8px"
-						]
-					),
-					"div"
-				);
-			
-			if(count($author) !== 0){
-				
-				$author =
-					$this->fuckhtml
-					->getTextContent(
-						$author[0]
-					);
-			}else{
-				
-				$author = null;
-			}
-			
-			// get date
-			$date = null;
-			
-			$date_div =
-				$this->fuckhtml
-				->getElementsByAttributeName(
-					"style",
-					"div"
-				);
-			
-			foreach($date_div as $d){
-				
-				$this->fuckhtml->load($d);
-				
-				$span =
-					$this->fuckhtml
-					->getElementsByTagName(
-						"span"
-					);
-				
-				if(
-					strpos(
-						$d["attributes"]["style"],
-						"bottom:"
-					) !== false
-				){
-					
-					$date =
-						strtotime(
-							$this->fuckhtml
-							->getTextContent(
-								$span[count($span) - 1]
-							)
-						);
-					break;
-				}
-			}
-			
-			$out["news"][] = [
-				"title" => $title,
-				"author" => $author,
-				"description" => $description,
-				"date" => $date,
-				"thumb" => $thumb,
-				"url" =>
-					$this->unshiturl(
-						$a["attributes"]
-						["href"]
-					)
-			];
-		}
-		
-		return $out;
+		throw new Exception("There are no known ways to scrape Google's /search endpoint without JS at this time. I'm working on a method that extracts cookies from browsers. Use Google API/CSE/Yahoo JP/Startpage for google results for now.");
 	}
 	
 	
@@ -1730,6 +599,11 @@ class google{
 				);
 			
 			$params = json_decode($params, true);
+			
+			$page = $params["page"]; // token already holds the NEXT ijn index: $page++ runs before store(), so adding 1 here skipped every other page
+			$params = $params["params"];
+			$params["async"] = "_fmt:json,p:1,ijn:{$page}";
+			
 		}else{
 			
 			$search = $get["s"];
@@ -1749,9 +623,13 @@ class google{
 			$format = $get["format"];
 			$rights = $get["rights"];
 			
+			$page = 0;
+			
 			$params = [
 				"q" => $search,
-				"udm" => "2" // get images
+				"tbm" => "isch",
+				"asearch" => "isch",
+				"async" => "_fmt:json,p:1,ijn:{$page}", // ijn:0 = page 1
 			];
 			
 			// country (image search uses cr instead of gl)
@@ -1834,13 +712,9 @@ class google{
 				$params["tbs"] = rtrim($params["tbs"], ",");
 			}
 		}
-		/*
-		$handle = fopen("scraper/page.html", "r");
-		$html = fread($handle, filesize("scraper/page.html"));
-		fclose($handle);*/
 		
 		try{
-			$html = 
+			$json = 
 				$this->get(
 					$proxy,
 					"https://www.google.com/search",
@@ -1851,12 +725,28 @@ class google{
 			throw new Exception("Failed to get search page");
 		}
 		
-		$this->fuckhtml->load($html);
+		unset($params["async"]);
+		
+		//$json = file_get_contents("scraper/google.json");
 		
+		// detect captcha
+		$this->fuckhtml->load($json);
 		$this->detect_sorry();
 		
-		// get javascript images
-		$this->scrape_imagearr($html);
+		// remove xssi
+		$json =
+			preg_replace(
+				'/^[^{]*/',
+				"",
+				$json
+			);
+		
+		$json = json_decode($json, true);
+		
+		if($json === null){
+			
+			throw new Exception("Failed to decode JSON");
+		}
 		
 		$out = [
 			"status" => "ok",
@@ -1864,69 +754,42 @@ class google{
 			"image" => []
 		];
 		
-		$images =
-			$this->fuckhtml
-			->getElementsByClassName(
-				"ivg-i",
-				"div"
-			);
-		
-		foreach($images as $div){
-			
-			$this->fuckhtml->load($div);
-			
-			$image =
-				$this->fuckhtml
-				->getElementsByTagName("img")[0];
+		if(!isset($json["ischj"]["metadata"])){
 			
-			// make sure we dont attempt to show an image we dont have data for
-			if(
-				isset($div["attributes"]["data-docid"]) &&
-				isset($this->image_arr[$div["attributes"]["data-docid"]])
-			){
-				
-				$source =
-					$this->image_arr[
-						$div["attributes"]["data-docid"]
-					];
-			}else{
-				
-				continue;
-			}
+			throw new Exception("Google did not return an image array");
+		}
+		
+		foreach($json["ischj"]["metadata"] as $image){
 			
 			$out["image"][] = [
-				"title" =>
-					$this->titledots(
-						$this->fuckhtml
-						->getTextContent(
-							$image["attributes"]["alt"]
-						)
-					),
-				"source" => $source,
-				"url" =>
-					$this->fuckhtml
-					->getTextContent(
-						$div["attributes"]["data-lpage"]
-					)
+				"title" => $this->titledots($image["result"]["page_title"]),
+				"source" => [
+					[
+						"url" => $image["original_image"]["url"],
+						"width" => (int)$image["original_image"]["width"],
+						"height" => (int)$image["original_image"]["height"]
+					],
+					[
+						"url" => $image["thumbnail"]["url"],
+						"width" => (int)$image["thumbnail"]["width"],
+						"height" => (int)$image["thumbnail"]["height"]
+					]
+				],
+				"url" => $image["result"]["referrer_url"]
 			];
 		}
 		
-		// as usual, no way to check if there is a next page reliably
-		if(count($out["image"]) > 50){
-			
-			if(!isset($params["start"])){
-				
-				$params["start"] = 10;
-			}else{
-				
-				$params["start"] += 10;
-			}
+		$page++;
+		
+		if(count($out["image"]) === 10){
 			
 			$out["npt"] =
-				$this->backend
-				->store(
-					json_encode($params),
-					"image",
+				$this->backend->store(
+					json_encode([
+						"params" => $params,
+						"page" => $page
+					]),
+					"images",
 					$proxy
 				);
 		}
author	lolcat <will@lolcat.ca>	2026-03-18 00:01:22 -0400
committer	lolcat <will@lolcat.ca>	2026-03-18 00:01:22 -0400
commit	61548e8b84e948ef71e405a980941a9fc996ae28 (patch)
tree	9bfa394171127357efca7d981b73316f5d4e5fc2 /scraper
parent	2e5edda85b341074afd48a6a3c37ad9e2b249679 (diff)