about | summary | refs | log | tree | commit | diff | stats
path: root/scraper/google.php
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2025-04-02 21:40:53 -0400
committer lolcat <will@lolcat.ca> 2025-04-02 21:40:53 -0400
commit 3e2c3fc5d90a0b0f859358d1c5b2614ab26905a4 (patch)
tree 667fd0956167c10f86251e276a3a12224b5d130b /scraper/google.php
parent 49ddd1a216d3d885081f28d57e32dc6abac3f7ff (diff)
fixed google videos
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- scraper/google.php 913
1 files changed, 207 insertions, 706 deletions
diff --git a/scraper/google.php b/scraper/google.php
index d2c9eda..b3b3b13 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -578,697 +578,6 @@ class google{
}
- private function parsepage($html, $pagetype, $search, $proxy, $params){
-
- $out = [
- "status" => "ok",
- "spelling" => [
- "type" => "no_correction",
- "using" => null,
- "correction" => null
- ],
- "npt" => null,
- "answer" => [],
- "web" => [],
- "image" => [],
- "video" => [],
- "news" => [],
- "related" => []
- ];
-
- $this->fuckhtml->load($html);
-
- $this->detect_sorry();
-
- // parse all <style> tags
- $this->parsestyles();
-
- // get javascript images
- $this->scrape_dimg($html);
-
- // get html blobs
- preg_match_all(
- '/function\(\){window\.jsl\.dh\(\'([^\']+?)\',\'(.+?[^\'])\'\);/',
- $html,
- $blobs
- );
-
- $this->blobs = [];
- if(isset($blobs[1])){
-
- for($i=0; $i<count($blobs[1]); $i++){
-
- $this->blobs[$blobs[1][$i]] =
- $this->fuckhtml
- ->parseJsString(
- $blobs[2][$i]
- );
- }
- }
-
- $this->scrape_imagearr($html);
-
- //
- // load result column
- //
-
- $result_div =
- $this->fuckhtml
- ->getElementById(
- "center_col",
- "div"
- );
-
- if($result_div === false){
-
- throw new Exception("Failed to grep result div");
- }
-
- $this->fuckhtml->load($result_div);
-
- // important for later
- $last_page = false;
-
- //
- // Get text results
- //
- $results =
- $this->fuckhtml
- ->getElementsByClassName(
- "g",
- "div"
- );
-
- $this->skip_next = false;
-
- foreach($results as $result){
-
- if($this->skip_next){
-
- $this->skip_next = false;
- continue;
- }
-
- $this->fuckhtml->load($result);
-
- $web = [
- "title" => null,
- "description" => null,
- "url" => null,
- "date" => null,
- "type" => "web",
- "thumb" => [
- "url" => null,
- "ratio" => null
- ],
- "sublink" => [],
- "table" => []
- ];
-
- // Detect presence of sublinks
- $g =
- $this->fuckhtml
- ->getElementsByClassName(
- "g",
- "div"
- );
-
- if(count($g) > 0){
-
- // skip on next iteration
- $this->skip_next = true;
- }
-
- // get title
- $h3 =
- $this->fuckhtml
- ->getElementsByTagName(
- "h3"
- );
-
- if(count($h3) === 0){
-
- continue;
- }
-
- $web["title"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $h3[0]
- )
- );
-
- // get url
- $as =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- );
-
- $web["url"] =
- $this->unshiturl(
- $as[0]
- ["attributes"]
- ["href"]
- );
-
- if(
- !preg_match(
- '/^http/',
- $web["url"]
- )
- ){
-
- // skip if invalid url is found
- continue;
- }
-
- //
- // get viewcount, time posted and follower count from <cite> tag
- //
- $cite =
- $this->fuckhtml
- ->getElementsByTagName(
- "cite"
- );
-
- if(count($cite) !== 0){
-
- $this->fuckhtml->load($cite[0]);
-
- $spans =
- $this->fuckhtml
- ->getElementsByTagName("span");
-
- if(count($spans) === 0){
-
- $cites =
- explode(
- "·",
- $this->fuckhtml
- ->getTextContent(
- $cite[0]
- )
- );
-
- foreach($cites as $cite){
-
- $cite = trim($cite);
-
- if(
- preg_match(
- '/(.+) (views|followers|likes)$/',
- $cite,
- $match
- )
- ){
-
- $web["table"][ucfirst($match[2])] =
- $match[1];
- }elseif(
- preg_match(
- '/ago$/',
- $cite
- )
- ){
-
- $web["date"] =
- strtotime($cite);
- }
- }
- }
-
- // reset
- $this->fuckhtml->load($result);
- }
-
- //
- // attempt to fetch description cleanly
- //
- $description =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "style",
- "-webkit-line-clamp:2"
- );
-
- if(count($description) !== 0){
-
- $web["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- );
- }else{
-
- // use ANOTHER method where the description is a header of the result
- $description =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "data-attrid",
- "wa:/description"
- );
-
- if(count($description) !== 0){
-
- // get date off that shit
- $date =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "font-size" => "12px",
- "line-height" => "1.34",
- "display" => "inline-block",
- "font-family" => "google sans,arial,sans-serif",
- "padding-right" => "0",
- "white-space" => "nowrap"
- ]
- ),
- "span"
- );
-
- if(count($date) !== 0){
-
- $description[0]["innerHTML"] =
- str_replace(
- $date[0]["outerHTML"],
- "",
- $description[0]["innerHTML"]
- );
-
- $web["date"] =
- strtotime(
- $this->fuckhtml
- ->getTextContent(
- $date[0]
- )
- );
- }
-
- $web["description"] =
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- );
- }else{
-
- // Yes.. You guessed it, use ANOTHER method to get descriptions
- // off youtube containers
- $description =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "-webkit-box-orient" => "vertical",
- "display" => "-webkit-box",
- "font-size" => "14px",
- "-webkit-line-clamp" => "2",
- "line-height" => "22px",
- "overflow" => "hidden",
- "word-break" => "break-word",
- "color" => "#4d5156"
- ]
- ),
- "div"
- );
-
- if(count($description) !== 0){
-
- // check for video duration
- $duration =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "background-color" => "rgba(0,0,0,0.6)",
- "color" => "#fff",
- "fill" => "#fff"
- ]
- ),
- "div"
- );
-
- if(count($duration) !== 0){
-
- $web["table"]["Duration"] =
- $this->fuckhtml
- ->getTextContent(
- $duration[0]
- );
- }
-
- $web["description"] =
- $this->titledots(
- html_entity_decode(
- $this->fuckhtml
- ->getTextContent(
- $description[0]
- )
- )
- );
-
- // get author + time posted
- $info =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "color" => "var(" . $this->getcolorvar("#70757a") . ")",
- "font-size" => "14px",
- "line-height" => "20px",
- "margin-top" => "12px"
- ]
- ),
- "div"
- );
-
- if(count($info) !== 0){
-
- $info =
- explode(
- "·",
- $this->fuckhtml
- ->getTextContent(
- $info[0]
- )
- );
-
- switch(count($info)){
-
- case 3:
- $web["table"]["Author"] = trim($info[1]);
- $web["date"] = strtotime(trim($info[2]));
- break;
-
- case 2:
- $web["date"] = strtotime(trim($info[1]));
- break;
- }
- }
- }
- }
- }
-
- //
- // get categories of content within the search result
- //
- $cats =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-sncf",
- "div"
- );
-
- foreach($cats as $cat){
-
- $this->fuckhtml->load($cat);
-
- // detect image category
- $images =
- $this->fuckhtml
- ->getElementsByTagName(
- "img"
- );
-
- if(count($images) !== 0){
-
- foreach($images as $image){
-
- if(isset($image["attributes"]["id"])){
- // we found an image
-
- if(isset($image["attributes"]["width"])){
-
- $width = (int)$image["attributes"]["width"];
-
- if($width == 110){
-
- $ratio = "1:1";
- }elseif($width > 110){
-
- $ratio = "16:9";
- }else{
-
- $ratio = "9:16";
- }
- }else{
-
- $ratio = "1:1";
- }
-
- $web["thumb"] = [
- "url" => $this->getdimg($image["attributes"]["id"]),
- "ratio" => $ratio
- ];
-
- continue 2;
- }
- }
- }
-
- // Detect rating
- $spans_unfiltered =
- $this->fuckhtml
- ->getElementsByTagName(
- "span"
- );
-
- $spans =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "aria-label",
- $spans_unfiltered
- );
-
- foreach($spans as $span){
-
- if(
- preg_match(
- '/^Rated/',
- $span["attributes"]["aria-label"]
- )
- ){
-
- // found rating
- // scrape rating
- preg_match(
- '/([0-9.]+).*([0-9.]+)/',
- $span["attributes"]["aria-label"],
- $rating
- );
-
- if(isset($rating[1])){
-
- $web["table"]["Rating"] =
- $rating[1] . "/" . $rating[2];
- }
-
- $has_seen_reviews = 0;
- foreach($spans_unfiltered as $span_unfiltered){
-
- if(
- preg_match(
- '/([0-9,.]+) +([A-z]+)$/',
- $this->fuckhtml
- ->getTextContent(
- $span_unfiltered
- ),
- $votes
- )
- ){
-
- $has_seen_reviews++;
- $web["table"][ucfirst($votes[2])] = $votes[1];
- continue;
- }
-
- $text =
- $this->fuckhtml
- ->getTextContent(
- $span_unfiltered
- );
-
- if(
- $text == "&nbsp;&nbsp;&nbsp;" ||
- $text == ""
- ){
-
- break;
- }
-
- switch($has_seen_reviews){
-
- case 1:
- // scrape price
- $web["table"]["Price"] = $text;
- $has_seen_reviews++;
- break;
-
- case 2:
- // scrape platform
- $web["table"]["Platform"] = $text;
- $has_seen_reviews++;
- break;
-
- case 3:
- // Scrape type
- $web["table"]["Medium"] = $text;
- break;
- }
- }
-
- continue 2;
- }
- }
-
- // check if its an answer header
- $answer_header =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "overflow" => "hidden",
- "text-overflow" => "ellipsis"
- ]
- ),
- "span"
- );
-
- if(count($answer_header) !== 0){
-
- $link =
- $this->fuckhtml
- ->getElementsByTagName(
- "a"
- );
-
- $cat["innerHTML"] =
- str_replace(
- $link[0]["outerHTML"],
- "",
- $cat["innerHTML"]
- );
-
- continue;
- }
-
- // we probed everything, assume this is the description
- // if we didn't find one cleanly previously
- if($web["description"] === null){
- $web["description"] =
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $cat
- )
- );
- }
- }
-
- // check if description contains date
- $description = explode("—", $web["description"], 2);
-
- if(
- count($description) === 2 &&
- strlen($description[0]) <= 20
- ){
-
- $date = strtotime($description[0]);
-
- if($date !== false){
-
- $web["date"] = $date;
- $web["description"] = ltrim($description[1]);
- }
- }
-
- // fetch youtube thumbnail
- $thumbnail =
- $this->fuckhtml
- ->getElementsByClassName(
- $this->getstyle(
- [
- "border-radius" => "8px",
- "height" => "fit-content",
- "justify-content" => "center",
- "margin-right" => "20px",
- "margin-top" => "4px",
- "position" => "relative",
- "width" => "fit-content"
- ]
- ),
- "div"
- );
-
- if(count($thumbnail) !== 0){
-
- // load thumbnail container
- $this->fuckhtml->load($thumbnail[0]);
-
- $image =
- $this->fuckhtml
- ->getElementsByTagName(
- "img"
- );
-
- if(
- count($image) !== 0 &&
- isset($image[0]["attributes"]["id"])
- ){
-
- $web["thumb"] = [
- "url" =>
- $this->unshit_thumb(
- $this->getdimg(
- $image[0]["attributes"]["id"]
- )
- ),
- "ratio" => "16:9"
- ];
- }
-
- // reset
- $this->fuckhtml->load($result);
- }
-
- $out["web"][] = $web;
- }
-
- // reset
- $this->fuckhtml->load($result_div);
-
- //
- // craft $npt token
- //
- if(
- $last_page === false &&
- count($out["web"]) !== 0
- ){
- if(!isset($params["start"])){
-
- $params["start"] = 20;
- }else{
-
- $params["start"] += 20;
- }
-
- $out["npt"] =
- $this->backend
- ->store(
- json_encode($params),
- $pagetype,
- $proxy
- );
- }
-
- return $out;
- }
-
-
private function scrape_dimg($html){
// get images loaded through javascript
@@ -2554,8 +1863,6 @@ class google{
[$params, $proxy] = $this->backend->get($get["npt"], "video");
$params = json_decode($params, true);
- $search = $params["q"];
-
}else{
$search = $get["s"];
$country = $get["country"];
@@ -2569,9 +1876,9 @@ class google{
$params = [
"q" => $search,
- "tbm" => "vid",
+ "udm" => "7",
"hl" => "en",
- "num" => "20"
+ "num" => 20
];
// country
@@ -2637,12 +1944,35 @@ class google{
throw new Exception("Failed to get HTML");
}
- //$html = file_get_contents("scraper/google.html");
+ if(!isset($params["start"])){
+
+ $params["start"] = 0;
+ }
+ $params["start"] += 20;
+
+ $this->fuckhtml->load($html);
+
+ //
+ // Parse web video page
+ //
+ $this->detect_sorry();
+
+ // parse all <style> tags
+ $this->parsestyles();
+
+ // get javascript images
+ $this->scrape_dimg($html);
+
+ $this->scrape_imagearr($html);
- $response = $this->parsepage($html, "videos", $search, $proxy, $params);
$out = [
"status" => "ok",
- "npt" => $response["npt"],
+ "npt" =>
+ $this->backend->store(
+ json_encode($params),
+ "videos",
+ $proxy
+ ),
"video" => [],
"author" => [],
"livestream" => [],
@@ -2650,21 +1980,192 @@ class google{
"reel" => []
];
- foreach($response["web"] as $result){
+ $search_div =
+ $this->fuckhtml
+ ->getElementById(
+ "center_col"
+ );
+
+ if($search_div === false){
+
+ throw new Exception("Failed to grep search div");
+ }
+
+ $this->fuckhtml->load($search_div);
+
+ $results =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "margin" => "0px 0px 30px"
+ ]),
+ "div"
+ );
+
+ foreach($results as $result){
+
+ $this->fuckhtml->load($result);
+
+ $url =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "a"
+ );
+
+ if(count($url) === 0){
+
+ // no url, weird, continue
+ continue;
+ }
+
+ $title =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "h3"
+ );
+
+ if(count($title) === 0){
+
+ // no title, weird, continue
+ continue;
+ }
+
+ // get description
+ $description =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "-webkit-box-orient" => "vertical",
+ "display" => "-webkit-box",
+ "-webkit-line-clamp" => "2",
+ "overflow" => "hidden",
+ "word-break" => "break-word"
+ ]),
+ "div"
+ );
+
+ if(count($description) === 0){
+
+ $description = null;
+ }else{
+
+ $description =
+ html_entity_decode(
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $description[0]
+ )
+ )
+ );
+ }
+
+ // get author + date posted
+ $metadiv =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "margin-top" => "12px"
+ ]),
+ "div"
+ );
+
+ $author = null;
+ $date = null;
+
+ if(count($metadiv) !== 0){
+
+ $metadiv =
+ explode(
+ "·",
+ $this->fuckhtml
+ ->getTextContent(
+ $metadiv[0]
+ )
+ );
+
+ if(count($metadiv) === 3){
+
+ $author = trim($metadiv[1]);
+ $date = strtotime(trim($metadiv[2]));
+ }elseif(count($metadiv) === 2){
+
+ $author = trim($metadiv[0]);
+ $date = strtotime(trim($metadiv[1]));
+ }
+ }
+
+ $thumb = [
+ "url" => null,
+ "ratio" => null
+ ];
+
+ $image =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "img"
+ );
+
+ $duration = null;
+
+ if(
+ count($image) !== 0 &&
+ isset($image[0]["attributes"]["id"])
+ ){
+
+ $thumb = [
+ "url" => $this->getdimg($image[0]["attributes"]["id"]),
+ "ratio" => "16:9"
+ ];
+
+ // get duration
+ $duration =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ $this->getstyle([
+ "background-color" => "rgba(0,0,0,0.6)",
+ "color" => "#fff",
+ "fill" => "#fff"
+ ])
+ );
+
+ if(count($duration) !== 0){
+
+ $duration =
+ $this->hms2int(
+ $this->fuckhtml
+ ->getTextContent(
+ $duration[0]
+ ));
+ }else{
+
+ $duration = null;
+ }
+ }
$out["video"][] = [
- "title" => $result["title"],
- "description" => $result["description"],
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $title[0]
+ )
+ ),
+ "description" => $description,
"author" => [
- "name" => isset($result["table"]["Author"]) ? $result["table"]["Author"] : null,
+ "name" => $author,
"url" => null,
"avatar" => null
],
- "date" => $result["date"],
- "duration" => isset($result["table"]["Duration"]) ? $this->hms2int($result["table"]["Duration"]) : null,
+ "date" => $date,
+ "duration" => $duration,
"views" => null,
- "thumb" => $result["thumb"],
- "url" => $result["url"]
+ "thumb" => $thumb,
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $url[0]["attributes"]["href"]
+ )
];
}
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage