aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/google.php
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2026-03-04 02:07:43 -0500
committerlolcat <will@lolcat.ca>2026-03-04 02:07:43 -0500
commitf324ddd3a7532c1ed40b265ad3db9da4e6ac94a9 (patch)
tree102cb7b9ceadcea849b42d5b9826bb808c3596d5 /scraper/google.php
parentcdf9164113a3cef01b4f26f6e02887a3e3c91267 (diff)
the excuse of a search engine broke my scraper again. go back to generating slop you waste of oxygen
Diffstat (limited to 'scraper/google.php')
-rw-r--r--scraper/google.php534
1 files changed, 283 insertions, 251 deletions
diff --git a/scraper/google.php b/scraper/google.php
index 03fa718..a3e76e8 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -520,7 +520,7 @@ class google{
if($alt_ua === true){
curl_setopt($curlproc, CURLOPT_HTTPHEADER, [
- "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows CE; PPC; 240x320) Opera 8.65 [nl]",
+ "User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 26_0_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) GSA/406.0.862495628 Mobile/15E148 Safari/604.1",
"Accept: text/html, application/xml;q=0.9, */*;q=0.8",
"Accept-Language: nl,en;q=0.8",
"Accept-Encoding: gzip, deflate",
@@ -573,10 +573,8 @@ class google{
public function web($get){
- // this is going to break soon. I wont scrape the answers simply cause its not worth my time.
- // If only their API wasn't such dogshit maybe I wouldnt need to fuck with this. this isn't
- // just a rant. I know a Google engineer is reading this, give me real fucking results
- // you worthless sacks of shit
+ // it broke again. lasted 3 months
+ // lets hope for another solid 3 month
$out = [
"status" => "ok",
@@ -690,6 +688,7 @@ class google{
throw new Exception("Failed to get HTML");
}
+
//$html = file_get_contents("scraper/google.html");
}
@@ -698,25 +697,22 @@ class google{
$this->detect_sorry();
$this->parsestyles();
- // iterate over results
- $containers =
+ // get javascript images
+ $this->scrape_dimg($html);
+ $this->scrape_imagearr($html);
+
+ // get next page
+ $npt =
$this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "background-color" => "#fff",
- "margin-bottom" => "10px",
- "margin" => "0px 0px 8px",
- "box-shadow" => "0 0 0 1px #ebedef"
- ])
+ ->getElementsByAttributeValue(
+ "aria-label",
+ "More search results",
+ "a"
);
- foreach($containers as $container){
-
- $this->fuckhtml->load($container);
+ if(count($npt) === 0){
- //
- // Probe for next page container
- //
+ // maybe we have the npt object from 2nd page, probe for that
$npt =
$this->fuckhtml
->getElementsByAttributeValue(
@@ -724,283 +720,317 @@ class google{
"Next page",
"a"
);
+ }
+
+ if(count($npt) !== 0){
- if(count($npt) !== 0){
-
- // found next page object
- $out["npt"] =
- $this->backend->store(
- $this->fuckhtml
- ->getTextContent(
- $npt[0]
- ["attributes"]
- ["href"]
- ),
- "web",
- $proxy
- );
- continue;
- }
+ $out["npt"] =
+ $this->backend->store(
+ $this->fuckhtml
+ ->getTextContent(
+ $npt[0]["attributes"]["href"]
+ ),
+ "web",
+ $proxy
+ );
+ }
+
+ // outer div is .MjjYud
+ // inner div always contain role="presentation"
+
+ $outer =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "MjjYud",
+ "div"
+ );
+
+ // used later
+ $fancycontainer_class =
+ explode(
+ " ",
+ $this->getstyle([
+ "padding-top" => "4px",
+ "padding-bottom" => "calc(12px*1)"
+ ]),
+ 2
+ );
+
+ if(count($fancycontainer_class) === 2){
- //
- // Probe for "did you mean" bullshit
- //
- $ddm =
+ $fancycontainer_class = $fancycontainer_class[1];
+ }else{
+
+ $fancycontainer_class = false;
+ }
+
+ foreach($outer as $container){
+
+ $this->fuckhtml->load($container);
+
+ // probe for search result
+ $sprobe =
$this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "font-size" => "20px",
- "font-weight" => "bold",
- "line-height" => "26px",
- "color" => "#1f1f1f",
- "height" => "14px",
- "padding" => "16px 14px 0px 14px",
- "margin" => "0"
- ])
+ ->getElementsByAttributeValue(
+ "role",
+ "presentation",
+ "a"
);
- if(
- count($ddm) !== 0 &&
- strtolower(
- $this->fuckhtml
- ->getTextContent(
- $ddm[0]
- )
- ) == "people also search for"
- ){
+ if(count($sprobe) !== 0){
- $as =
+ // we found a search result
+
+ $title =
$this->fuckhtml
- ->getElementsByTagName("a");
+ ->getElementsByAttributeValue(
+ "role",
+ "link",
+ "div"
+ );
- foreach($as as $a){
+ if(count($title) === 0){
- $out["related"][] =
+ // should not happen
+ continue;
+ }
+
+ $title =
+ $this->titledots(
$this->fuckhtml
->getTextContent(
- $a
- );
- }
- continue;
- }
-
- //
- // Parse normal web results
- //
-
- // probe for website ellipsis shit
- $ellipsis =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "text-overflow" => "ellipsis",
- "white-space" => "nowrap",
- "overflow" => "hidden"
- ])
- );
-
- if(count($ellipsis) < 1){
+ $title[0]
+ )
+ );
- // should not happen
- continue;
- }
-
- $title =
- $this->fuckhtml
- ->getElementsByTagName(
- "h3"
- );
-
- if(count($title) === 0){
+ // get description
+ // as usual, theres a thousand fucking possible divs for this one
- // should not happen
- continue;
- }
-
- $title =
- $this->fuckhtml
- ->getTextContent(
- $title[0]
- );
-
- // get URL
- $as =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- );
-
- if(count($as) === 0){
+ // probe for youtube-like description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "align-items" => "flex-start",
+ "display" => "flex",
+ "justify-content" => "center",
+ "padding" => "7px 12px",
+ "padding-right" => "0",
+ "padding-top" => "0"
+ ]),
+ "div"
+ );
- // should not happen
- continue;
- }
-
- $link =
- $this->unshiturl(
- $as[0]
- ["attributes"]
- ["href"]
- );
-
- // grep container separators
- $separator =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "padding" => "16px 14px 12px"
- ])
- );
-
- if(count($separator) < 2){
+ $ratio = "16:9";
- // should not happen
- continue;
- }
-
- $this->fuckhtml->load($separator[1]);
-
- $snippets =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "white-space" => "pre-line",
- "word-wrap" => "break-word"
- ])
- );
-
- if(count($snippets) < 2){
+ if(count($description) === 0){
+
+ // fail. find the one with the image on the right handside
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "style",
+ "padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:12px",
+ "div"
+ );
+
+ $ratio = "1:1";
+
+ if(count($description) === 0){
+
+ // fail. find the one that is used the most
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "style",
+ "-webkit-line-clamp:3",
+ "div"
+ );
+
+ if(count($description) === 0){
+
+ // last fail. this one appears with divs that have prices
+ $description =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "style",
+ "max-width:100vw;grid-area:nke7rc;padding-top:2px;padding-right:8px;padding-left:16px;padding-bottom:6px",
+ "div"
+ );
+ }
+ }
+ }
- // should not happen
- continue;
- }
-
- // get description
- $description =
- $this->fuckhtml
- ->getTextContent(
- $snippets[1]
- );
-
- // get date from description
- $exp_description = explode(" · ", $description, 2);
- $date = null;
-
- if(count($exp_description) === 1){
+ if(count($description) === 0){
+
+ // should not happen but whatever
+ $description = null;
+ }else{
+
+ $description =
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ );
+ }
- $description = $exp_description[0];
- }else{
+ // probe for date
+ $desc2 = explode("—", $description, 2);
- $date_probe = strtotime($exp_description[0]);
+ $time = null;
- if(
- strlen($exp_description[0]) <= 17 &&
- $date_probe !== false
- ){
+ if(count($desc2) === 2){
- $date = $date_probe;
- $description = $exp_description[1];
+ $time = strtotime($desc2[0]);
+
+ if(
+ strlen($desc2[0]) < 16 &&
+ $time !== false
+ ){
+
+ $description = ltrim($desc2[1]);
+ }else{
+
+ $time = null;
+ }
}
- }
-
- // get thumb
- $thumb_probe =
- $this->fuckhtml
- ->getElementsByTagName(
- "img"
- );
-
- // too lazy to fix this piece of shit
- // will probably break soon anyways idgaf
- /*
- if(count($thumb_probe) === 0){
$thumb = [
"ratio" => null,
- "url" => null
- ];
- }else{
-
- $thumb = [
- "ratio" => "1:1",
- "url" =>
- $this->getdimg(
- $thumb_probe[0]
- ["attributes"]
- ["id"]
- )
+ "url" => null,
];
- }*/
-
- $thumb = [
- "ratio" => null,
- "url" => null
- ];
-
- // get sublinks
- $sublinks = [];
- foreach($as as $a){
-
- $this->fuckhtml->load($a);
- $probe =
+ // get thumbnail
+ $images =
$this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle([
- "color" => "#1558d6",
- "font-size" => "14px",
- "line-height" => "20px"
- ])
+ ->getElementsByTagName(
+ "img"
);
- $url =
- $this->unshiturl(
- $a["attributes"]["href"]
- );
+ foreach($images as $image){
+
+ if(isset($image["attributes"]["id"])){
+
+ $thumb = [
+ "ratio" => $ratio,
+ "url" => $this->getdimg($image["attributes"]["id"])
+ ];
+ }
+ }
+
+ // get sublinks
+ $sublinks = [];
+
+ // probe for the fancy version
+ if($fancycontainer_class !== false){
+ $fancycontainer =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $fancycontainer_class,
+ "div"
+ );
+ }
if(
- preg_match(
- '/^http/',
- $url
- )
+ $fancycontainer_class !== false &&
+ count($fancycontainer) !== 0
){
- if(count($probe) !== 0){
+ $this->fuckhtml->load($fancycontainer[0]);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ foreach($as as $a){
$sublinks[] = [
"title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $probe[0]
- )
+ $this->fuckhtml
+ ->getTextContent(
+ $a
),
"description" => null,
"date" => null,
- "url" => $url
+ "url" =>
+ $this->unshiturl(
+ $a["attributes"]["href"]
+ )
];
}
}
+
+ $out["web"][] = [
+ "title" => $title,
+ "description" => $description,
+ "url" => $this->unshiturl($sprobe[0]["attributes"]["href"]),
+ "date" => $time,
+ "type" => "web",
+ "thumb" => $thumb,
+ "sublink" => $sublinks,
+ "table" => []
+ ];
+ continue;
}
- $out["web"][] = [
- "title" =>
- $this->titledots(
- $title
- ),
- "description" =>
- $this->titledots(
- $description
- ),
- "url" => $link,
- "date" => $date,
- "type" => "web",
- "thumb" => $thumb,
- "sublink" => $sublinks,
- "table" => []
- ];
+ // probe for containers with a title header
+ $title_header =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "display" => "flex",
+ "flex-wrap" => "wrap",
+ "position" => "relative",
+ "padding" => "16px"
+ ])
+ );
+
+ if(count($title_header) !== 0){
+
+ $title_header =
+ strtolower(
+ $this->fuckhtml
+ ->getTextContent(
+ $title_header[0]
+ )
+ );
+
+ switch($title_header){
+
+ case "people also search for":
+ // get all related searches
+ $relateds =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "display" => "flex",
+ "height" => "100%",
+ "flex-direction" => "column",
+ "max-width" => "100%"
+ ])
+ );
+
+ foreach($relateds as $r){
+
+ $out["related"][] =
+ $this->fuckhtml
+ ->getTextContent(
+ $r
+ );
+ }
+ break;
+ }
+
+ continue;
+ }
}
+ $out["related"] = array_values(array_unique($out["related"]));
+
return $out;
}
@@ -1324,6 +1354,8 @@ class google{
public function news($get){
+ throw new Exception("Broke for now, fuck off lol");
+
if($get["npt"]){
[$req, $proxy] = $this->backend->get($get["npt"], "news");
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage