diff options
| author | lolcat <will@lolcat.ca> | 2026-03-05 21:06:19 -0500 |
|---|---|---|
| committer | lolcat <will@lolcat.ca> | 2026-03-05 21:06:19 -0500 |
| commit | 2e5edda85b341074afd48a6a3c37ad9e2b249679 (patch) | |
| tree | 00ac71268d7568a343c7066058471a8267cd8ceb /scraper/google.php | |
| parent | 4e247c3ac4036495f01d880dabfa36ec0187bf37 (diff) | |
fix scrape failures on google
Diffstat (limited to 'scraper/google.php')
| -rw-r--r-- | scraper/google.php | 59 |
1 file changed, 40 insertions, 19 deletions
diff --git a/scraper/google.php b/scraper/google.php index 81a5e2d..2f71e0e 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -522,7 +522,7 @@ class google{ curl_setopt($curlproc, CURLOPT_HTTPHEADER, [ "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1", "Accept: text/html, application/xml;q=0.9, */*;q=0.8", - "Accept-Language: nl,en;q=0.8", + "Accept-Language: en-US,en;q=0.8", "Accept-Encoding: gzip, deflate", "Connection: Keep-Alive", "Cache-Control: no-cache" @@ -770,40 +770,61 @@ class google{ $this->fuckhtml->load($container); // probe for search result - $sprobe = + $title = $this->fuckhtml ->getElementsByAttributeValue( "role", - "presentation", - "a" + "link", + "div" ); - if(count($sprobe) !== 0){ + if(count($title) !== 0){ // we found a search result $title = + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ); + + // get url + $sprobe = $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "link", - "div" + ->getElementsByTagName( + "a" ); - if(count($title) === 0){ + $link = null; + + foreach($sprobe as $possible_link){ + + if( + isset($possible_link["attributes"]["href"]) && + preg_match( + '/^\/url\?q=/', + $possible_link["attributes"]["href"] + ) + ){ + + $link = + $this->fuckhtml + ->getTextContent( + $possible_link["attributes"]["href"] + ); + + break; + } + } + + if($link === null){ // should not happen continue; } - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - // get description // as usual, theres a thousand fucking possible divs for this one @@ -968,7 +989,7 @@ class google{ $out["web"][] = [ "title" => $title, "description" => $description, - "url" => $this->unshiturl($sprobe[0]["attributes"]["href"]), + "url" => $this->unshiturl($link), "date" => $time, "type" => "web", "thumb" => $thumb, |
