google "quote-unquote" fix

author: lolcat <will@lolcat.ca> 2025-10-08 00:42:36 -0400
committer: lolcat <will@lolcat.ca> 2025-10-08 00:42:36 -0400
commit: a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 (patch)
tree: 1105658aaafdaec94205a3d1c032ba967fd5ac20 /scraper/google.php
parent: 4b16fd58971321e6938046702210e2773573c621 (diff)
1 file changed, 1 insertion, 460 deletions
diff --git a/scraper/google.php b/scraper/google.php
index c83b084..049c844 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -561,466 +561,7 @@ class google{
 	
 	public function web($get){
 		
-		if($get["npt"]){
-			
-			[$params, $proxy] = $this->backend->get($get["npt"], "web");
-			
-			$params = json_decode($params, true);
-			
-			try{
-				$json =
-					$this->get(
-						$proxy,
-						"https://www.google.com/search",
-						$params
-					);
-			}catch(Exception $error){
-				
-				throw new Exception("Failed to get HTML");
-			}
-			
-		}else{
-			$search = $get["s"];
-			$country = $get["country"];
-			$nsfw = $get["nsfw"];
-			$lang = $get["lang"];
-			$older = $get["older"];
-			$newer = $get["newer"];
-			$spellcheck = $get["spellcheck"];
-			$proxy = $this->backend->get_ip();
-			
-			$offset = 0;
-			
-			/*
-			https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
-			
-			https://www.google.com/search?udm=14&
-			yv=3&
-			q=asmr&
-			biw=1920&
-			bih=947&
-			start=0&
-			sa=N&
-			asearch=arc&
-			cs=1&
-			async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
-			*/
-			
-			$params = [
-				"udm" => 14,
-				"yv" => 3,
-				"q" => $search,
-				"biw" => 1920,
-				"bih" => 947,
-				"start" => 0,
-				"sa" => "N",
-				"asearch" => "arc",
-				"cs" => 1,
-				"async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc",
-				"hl" => "en",
-				"num" => 20
-			];
-			
-			// country
-			if($country != "any"){
-				
-				$params["gl"] = $country;
-			}
-			
-			// nsfw
-			$params["safe"] = $nsfw == "yes" ? "off" : "active";
-			
-			// language
-			if($lang != "any"){
-				
-				$params["lr"] = "lang_" . $lang;
-			}
-			
-			// generate tbs
-			$tbs = [];
-			
-			// get date
-			$older = $older === false ? null : date("m/d/Y", $older);
-			$newer = $newer === false ? null : date("m/d/Y", $newer);
-			
-			if(
-				$older !== null ||
-				$newer !== null
-			){
-				
-				$tbs["cdr"] = "1";
-				$tbs["cd_min"] = $newer;
-				$tbs["cd_max"] = $older;
-			}
-			
-			// spellcheck filter
-			if($spellcheck == "no"){
-				
-				$params["nfpr"] = "1";
-			}
-			
-			if(count($tbs) !== 0){
-				
-				$params["tbs"] = "";
-				
-				foreach($tbs as $key => $value){
-					
-					$params["tbs"] .= $key . ":" . $value . ",";
-				}
-				
-				$params["tbs"] = rtrim($params["tbs"], ",");
-			}
-			
-			try{
-				$json =
-					$this->get(
-						$proxy,
-						"https://www.google.com/search",
-						$params
-					);
-			}catch(Exception $error){
-				
-				throw new Exception("Failed to get HTML");
-			}
-			
-			//$json = file_get_contents("scraper/google.js");
-		}
-		
-		$out = [
-			"status" => "ok",
-			"spelling" => [
-				"type" => "no_correction",
-				"using" => null,
-				"correction" => null
-			],
-			"npt" => null,
-			"answer" => [],
-			"web" => [],
-			"image" => [],
-			"video" => [],
-			"news" => [],
-			"related" => []
-		];
-		
-		$this->fuckhtml->load($json);
-		$this->detect_sorry();
-		
-		// get next page
-		/*
-		$npt =
-			$this->fuckhtml
-			->getElementsByAttributeName(
-				"data-state-token",
-				"div"
-			);
-		
-		if(count($npt) !== 0){
-			
-			$params["sstk"] =
-				$this->fuckhtml
-				->getTextContent(
-					$npt[0]["attributes"]["data-state-token"]
-				);
-			
-			$params["start"] += 10;
-			
-			$out["npt"] =
-				$this->backend->store(
-					json_encode($params),
-					"web",
-					$proxy
-				);
-		}*/
-		
-		// get invididual results
-		$results =
-			$this->fuckhtml
-			->getElementsByAttributeName(
-				"data-hveid",
-				"div"
-			);
-		
-		foreach($results as $result){
-			
-			$this->fuckhtml->load($result);
-			
-			//echo $result["innerHTML"];
-			
-			$snfs =
-				$this->fuckhtml
-				->getElementsByAttributeName(
-					"data-snf",
-					"div"
-				);
-			
-			$title = null;
-			$description = null;
-			$link = null;
-			$sublinks = [];
-			$date = null;			
-			$thumb = [
-				"ratio" => null,
-				"url" => null
-			];
-			$table = [];
-			
-			// probe for title
-			$title_node =
-				$this->fuckhtml
-				->getElementsByTagName(
-					"h3"
-				);
-			
-			if(count($title_node) !== 0){
-				
-				// found a title node
-				$title =
-					$this->fuckhtml
-					->getTextContent(
-						$title_node[0]
-					);
-			}
-			
-			if($title === null){
-				
-				// should not happen
-				continue;
-			}
-			
-			foreach($snfs as $snf){
-				
-				$this->fuckhtml->load($snf);
-				
-				// probe for thumbnail
-				$thumbnail =
-					$this->fuckhtml
-					->getElementsByAttributeName(
-						"alt",
-						"img"
-					);
-				
-				foreach($thumbnail as $t){
-					
-					if(
-						isset($t["attributes"]["style"]) &&
-						preg_match(
-							'/height ?: ?([0-9]+)px/',
-							$t["attributes"]["style"],
-							$match
-						) &&
-						(int)$match[1] < 40
-					){
-						
-						// found a favicon, ignore
-						continue;
-					}
-					
-					$thumb = [
-						"ratio" => "1:1",
-						"url" =>
-							$this->fuckhtml
-							->getTextContent(
-								$thumbnail[0]["attributes"]["src"]
-							)
-					];
-					
-					continue 2;
-				}
-				
-				// probe for description
-				if($description === null){
-					
-					// probe 1
-					if(
-						isset($snf["attributes"]["data-sncf"]) &&
-						$snf["attributes"]["data-sncf"] == "1,2"
-					){
-						
-						$description =
-							$this->fuckhtml
-							->getTextContent(
-								$snf
-							);
-						continue;
-					}
-					
-					// probe 2
-					$desc_probe =
-						$this->fuckhtml
-						->getElementsByAttributeValue(
-							"style",
-							"-webkit-line-clamp:2",
-							"div"
-						);
-					
-					if(count($desc_probe) !== 0){
-						
-						$description =
-							$this->fuckhtml
-							->getTextContent(
-								$desc_probe[0]
-							);
-						continue;
-					}
-				}
-				
-				// probe for links
-				$links =
-					$this->fuckhtml
-					->getElementsByAttributeName(
-						"data-sb",
-						"a"
-					);
-				
-				if(isset($links[0]["attributes"]["data-ved"])){
-					
-					// found the page link
-					$link =
-						$this->fuckhtml
-						->getTextContent(
-							$links[0]["attributes"]["href"]
-						);
-					
-					continue;
-				}
-				
-				if(count($links) !== 0){
-					
-					// get all sublinks
-					for($i=0; $i<count($links); $i++){
-						
-						$sublinks[] = [
-							"title" =>
-								$this->titledots(
-									$this->fuckhtml
-									->getTextContent(
-										$links[$i]
-									)
-								),
-							"description" => null,
-							"date" => null,
-							"url" =>
-								$this->fuckhtml
-								->getTextContent(
-									$links[$i]["attributes"]["href"]
-								)
-						];
-					}
-					
-					continue;
-				}
-				
-				// get tabloid-able data
-				$tabloid =
-					$this->fuckhtml
-					->getElementsByAttributeValue(
-						"style",
-						"margin-top:0px",
-						"div"
-					);
-				
-				if(count($tabloid) === 0){
-					
-					// try getting <cite> instead
-					$tabloid =
-						$this->fuckhtml
-						->getElementsByTagName(
-							"cite"
-						);
-				}
-				
-				if(count($tabloid) !== 0){
-					
-					// found table
-					$tabloid =
-						explode("·", $tabloid[0]["innerHTML"]);
-					
-					foreach($tabloid as $tbl){
-						
-						$preg =
-							$this->fuckhtml
-							->getTextContent(
-								$tbl
-							);
-						
-						//$table[random_int(0,1000)] = $preg;
-						
-						if(
-							// match price
-							preg_match(
-								'/(\p{Sc}[^\p{Sc}]+)/',
-								$preg,
-								$match
-							)
-						){
-							
-							$table["Price"] = trim($match[1]);
-						}
-						
-						if(
-							// match in stock/delivery
-							preg_match(
-								'/(stock|delivery|returns)/i',
-								$preg,
-								$match
-							)
-						){
-								
-							$table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0");
-						}
-					}
-					continue;
-				}
-			}
-			
-			// extract date from description
-			$description_split =
-				explode(
-					"—", $description, 2
-				);
-			
-			if(count($description_split) === 1){
-				
-				$description = $description_split[0];
-			}elseif(strlen($description_split[0]) < 17){
-				
-				$date = strtotime($description_split[0]);
-				
-				if($date !== false){
-					
-					$description = $description_split[1];
-				}else{
-					
-					$date = null;
-				}
-			}
-			
-			$out["web"][] = [
-				"title" => $this->titledots($title),
-				"description" => $this->titledots($description),
-				"url" => $link,
-				"date" => $date,
-				"type" => "web",
-				"thumb" => $thumb,
-				"sublink" => $sublinks,
-				"table" => $table
-			];
-		}
-		
-		// get next page
-		if(count($out["web"]) > 5){
-			
-			$params["start"] += 10;
-			
-			$out["npt"] =
-				$this->backend->store(
-					json_encode($params),
-					"web",
-					$proxy
-				);
-		}
-		
-		return $out;
+		throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers.");
 	}
author	lolcat <will@lolcat.ca>	2025-10-08 00:42:36 -0400
committer	lolcat <will@lolcat.ca>	2025-10-08 00:42:36 -0400
commit	a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 (patch)
tree	1105658aaafdaec94205a3d1c032ba967fd5ac20 /scraper/google.php
parent	4b16fd58971321e6938046702210e2773573c621 (diff)