aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/google.php
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2026-03-05 21:06:19 -0500
committer lolcat <will@lolcat.ca> 2026-03-05 21:06:19 -0500
commit 2e5edda85b341074afd48a6a3c37ad9e2b249679 (patch)
tree 00ac71268d7568a343c7066058471a8267cd8ceb /scraper/google.php
parent 4e247c3ac4036495f01d880dabfa36ec0187bf37 (diff)
fix scrape failures on google
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- scraper/google.php 59
1 files changed, 40 insertions, 19 deletions
diff --git a/scraper/google.php b/scraper/google.php
index 81a5e2d..2f71e0e 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -522,7 +522,7 @@ class google{
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
"User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
- "Accept-Language: nl,en;q=0.8",
+ "Accept-Language: en-US,en;q=0.8",
"Accept-Encoding: gzip, deflate",
"Connection: Keep-Alive",
"Cache-Control: no-cache"
@@ -770,40 +770,61 @@ class google{
$this->fuckhtml->load($container);
// probe for search result
- $sprobe =
+ $title =
$this->fuckhtml
->getElementsByAttributeValue(
"role",
- "presentation",
- "a"
+ "link",
+ "div"
);
- if(count($sprobe) !== 0){
+ if(count($title) !== 0){
// we found a search result
$title =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ );
+
+ // get url
+ $sprobe =
$this->fuckhtml
- ->getElementsByAttributeValue(
- "role",
- "link",
- "div"
+ ->getElementsByTagName(
+ "a"
);
- if(count($title) === 0){
+ $link = null;
+
+ foreach($sprobe as $possible_link){
+
+ if(
+ isset($possible_link["attributes"]["href"]) &&
+ preg_match(
+ '/^\/url\?q=/',
+ $possible_link["attributes"]["href"]
+ )
+ ){
+
+ $link =
+ $this->fuckhtml
+ ->getTextContent(
+ $possible_link["attributes"]["href"]
+ );
+
+ break;
+ }
+ }
+
+ if($link === null){
// should not happen
continue;
}
- $title =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- )
- );
-
// get description
// as usual, theres a thousand fucking possible divs for this one
@@ -968,7 +989,7 @@ class google{
$out["web"][] = [
"title" => $title,
"description" => $description,
- "url" => $this->unshiturl($sprobe[0]["attributes"]["href"]),
+ "url" => $this->unshiturl($link),
"date" => $time,
"type" => "web",
"thumb" => $thumb,
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage