From 4b0d8f75dc754974e8e809f25bdf4d456ae6cd6a Mon Sep 17 00:00:00 2001 From: lolcat Date: Sun, 19 Jan 2025 14:02:24 -0500 Subject: google scraper fix --- scraper/google.php | 4871 ++++++++++++++++++---------------------------------- 1 file changed, 1635 insertions(+), 3236 deletions(-) diff --git a/scraper/google.php b/scraper/google.php index 4f02648..d2c9eda 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -504,27 +504,41 @@ class google{ } } - private function get($proxy, $url, $get = []){ - - $headers = [ - "User-Agent: " . config::USER_AGENT, - "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", - "Accept-Language: en-US,en;q=0.5", - "Accept-Encoding: gzip", - "DNT: 1", - //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", - "Connection: keep-alive", - "Upgrade-Insecure-Requests: 1", - "Sec-Fetch-Dest: document", - "Sec-Fetch-Mode: navigate", - "Sec-Fetch-Site: none", - "Sec-Fetch-User: ?1", - "Priority: u=1", - "TE: trailers" - ]; + private function get($proxy, $url, $get = [], $use_lynx = false){ $curlproc = curl_init(); + if($use_lynx === false){ + + $headers = [ + "User-Agent: " . config::USER_AGENT, + "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", + "Accept-Language: en-US,en;q=0.5", + "Accept-Encoding: gzip", + "DNT: 1", + //"Cookie: SOCS=CAESNQgCEitib3FfaWRlbnRpdHlmcm9udGVuZHVpc2VydmVyXzIwMjQwMzE3LjA4X3AwGgJlbiAEGgYIgM7orwY", + "Connection: keep-alive", + "Upgrade-Insecure-Requests: 1", + "Sec-Fetch-Dest: document", + "Sec-Fetch-Mode: navigate", + "Sec-Fetch-Site: none", + "Sec-Fetch-User: ?1", + "Priority: u=1", + "TE: trailers" + ]; + + // use http2 + curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); + }else{ + + $headers = [ + "Accept: text/html, text/plain, text/sgml, */*;q=0.01", + "Accept-Encoding: gzip, compress, bzip2", + "Accept-Language: en", + "User-Agent: Lynx/2.9.0dev.12 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/3.7.8" + ]; + } + if($get !== []){ $get = http_build_query($get); $url .= "?" . $get; @@ -535,9 +549,6 @@ class google{ curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); - // use http2 - curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); - curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); @@ -557,12 +568,16 @@ class google{ } curl_close($curlproc); + + if($use_lynx){ + + return mb_convert_encoding($data, "UTF-8", "ISO-8859-1"); + } + return $data; } - - private function parsepage($html, $pagetype, $search, $proxy, $params){ $out = [ @@ -616,6 +631,7 @@ class google{ // // load result column // + $result_div = $this->fuckhtml ->getElementById( @@ -630,3528 +646,1903 @@ class google{ $this->fuckhtml->load($result_div); + // important for later + $last_page = false; + // - // Get word corrections + // Get text results // - $correction = + $results = $this->fuckhtml - ->getElementById( - "fprs", - "p" + ->getElementsByClassName( + "g", + "div" ); - if($correction){ + $this->skip_next = false; + + foreach($results as $result){ + + if($this->skip_next){ + + $this->skip_next = false; + continue; + } + + $this->fuckhtml->load($result); - $this->fuckhtml->load($correction); + $web = [ + "title" => null, + "description" => null, + "url" => null, + "date" => null, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => [], + "table" => [] + ]; - $a = + // Detect presence of sublinks + $g = $this->fuckhtml - ->getElementsByTagName( - "a" + ->getElementsByClassName( + "g", + "div" ); - - $using = + + if(count($g) > 0){ + + // skip on next iteration + $this->skip_next = true; + } + + // get title + $h3 = $this->fuckhtml - ->getElementById( - "fprsl", - $a + ->getElementsByTagName( + "h3" ); - if($using){ - - $using = - $this->fuckhtml - ->getTextContent( - $using - ); - - $spans = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - $type_span = - $this->fuckhtml - ->getTextContent( - $spans[0] - ); - - $type = "not_many"; - - if( - stripos( - $type_span, - "Showing results for" - ) !== false - ){ - - $type = "including"; - } + if(count($h3) === 0){ - $correction = + continue; + } + + $web["title"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $a[count($a) - 1] - ); + $h3[0] + ) + ); + + // get url + $as = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + $web["url"] = + $this->unshiturl( + $as[0] + ["attributes"] + ["href"] + ); + + if( + !preg_match( + '/^http/', + $web["url"] + ) + ){ - $out["spelling"] = [ - "type" => $type, - "using" => $using, - "correction" => $correction - ]; + // skip if invalid url is found + continue; } - // reset - $this->fuckhtml->load($result_div); - }else{ - - // get the "Did you mean?" prompt - $taw = + // + // get viewcount, time posted and follower count from tag + // + $cite = $this->fuckhtml - ->getElementById( - "taw" + ->getElementsByTagName( + "cite" ); - if($taw){ + if(count($cite) !== 0){ - $this->fuckhtml->load($taw); + $this->fuckhtml->load($cite[0]); - $as = + $spans = $this->fuckhtml - ->getElementsByTagName( - "a" - ); + ->getElementsByTagName("span"); - if(count($as) !== 0){ + if(count($spans) === 0){ - $text = - $this->fuckhtml - ->getTextContent( - $as[0] + $cites = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $cite[0] + ) ); - // @TODO implement did_you_mean - $out["spelling"] = [ - "type" => "including", - "using" => $search, - "correction" => $text - ]; + foreach($cites as $cite){ + + $cite = trim($cite); + + if( + preg_match( + '/(.+) (views|followers|likes)$/', + $cite, + $match + ) + ){ + + $web["table"][ucfirst($match[2])] = + $match[1]; + }elseif( + preg_match( + '/ago$/', + $cite + ) + ){ + + $web["date"] = + strtotime($cite); + } + } } + + // reset + $this->fuckhtml->load($result); } - $this->fuckhtml->load($result_div); - } - - // - // get notices - // - $botstuff = - $this->fuckhtml - ->getElementById( - "botstuff" - ); - - // important for later - $last_page = false; - - if($botstuff){ - - $this->fuckhtml->load($botstuff); - - $cards = + // + // attempt to fetch description cleanly + // + $description = $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "line-height" => "normal" - ] - ), - "div" + ->getElementsByAttributeValue( + "style", + "-webkit-line-clamp:2" ); - foreach($cards as $card){ - - $this->fuckhtml->load($card); - - $h2 = - $this->fuckhtml - ->getElementsByTagName( - "h2" - ); + if(count($description) !== 0){ - if(count($h2) !== 0){ - - $title = + $web["description"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $h2[0] - ); - - $card["innerHTML"] = - str_replace( - $h2[0]["outerHTML"], - "", - $card["innerHTML"] - ); - }else{ - - $title = "Notice"; - } - - $div = - $this->fuckhtml - ->getElementsByTagName( - "div" + $description[0] + ) ); + }else{ - // probe for related searches div, if found, ignore it cause its shit - $probe = + // use ANOTHER method where the description is a header of the result + $description = $this->fuckhtml ->getElementsByAttributeValue( - "role", - "list", - $div + "data-attrid", + "wa:/description" ); - // also probe for children - if(count($probe) === 0){ + if(count($description) !== 0){ - $probe = + // get date off that shit + $date = $this->fuckhtml ->getElementsByClassName( $this->getstyle( [ - "flex-shrink" => "0", - "-moz-box-flex" => "0", - "flex-grow" => "0", - "overflow" => "hidden" + "font-size" => "12px", + "line-height" => "1.34", + "display" => "inline-block", + "font-family" => "google sans,arial,sans-serif", + "padding-right" => "0", + "white-space" => "nowrap" ] ), - $div - ); - } - - if(count($probe) === 0){ - - $description = []; - - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" + "span" ); - if(count($as) !== 0){ - - $first = true; + if(count($date) !== 0){ - foreach($as as $a){ - - $text_link = - $this->fuckhtml + $description[0]["innerHTML"] = + str_replace( + $date[0]["outerHTML"], + "", + $description[0]["innerHTML"] + ); + + $web["date"] = + strtotime( + $this->fuckhtml ->getTextContent( - $a - ); - - if(stripos($text_link, "repeat the search") !== false){ - - $last_page = true; - break 2; - } + $date[0] + ) + ); + } + + $web["description"] = + $this->fuckhtml + ->getTextContent( + $description[0] + ); + }else{ + + // Yes.. You guessed it, use ANOTHER method to get descriptions + // off youtube containers + $description = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "-webkit-box-orient" => "vertical", + "display" => "-webkit-box", + "font-size" => "14px", + "-webkit-line-clamp" => "2", + "line-height" => "22px", + "overflow" => "hidden", + "word-break" => "break-word", + "color" => "#4d5156" + ] + ), + "div" + ); + + if(count($description) !== 0){ + + // check for video duration + $duration = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "background-color" => "rgba(0,0,0,0.6)", + "color" => "#fff", + "fill" => "#fff" + ] + ), + "div" + ); + + if(count($duration) !== 0){ - $parts = - explode( - $a["outerHTML"], - $card["innerHTML"], - 2 + $web["table"]["Duration"] = + $this->fuckhtml + ->getTextContent( + $duration[0] ); - - $card["innerHTML"] = $parts[1]; - - $value = - preg_replace( - '/ +/', - " ", + } + + $web["description"] = + $this->titledots( + html_entity_decode( $this->fuckhtml ->getTextContent( - $parts[0], - false, - false + $description[0] ) - ); - - if(strlen(trim($value)) !== 0){ - - $description[] = [ - "type" => "text", - "value" => $value - ]; - - if($first){ - - $description[0]["value"] = - ltrim($description[0]["value"]); - } - } - - $first = false; - - $description[] = [ - "type" => "link", - "url" => - $this->fuckhtml - ->getTextContent( - $a["attributes"] - ["href"] - ), - "value" => $text_link - ]; - } + ) + ); - $text = + // get author + time posted + $info = $this->fuckhtml - ->getTextContent( - $card["innerHTML"], - false, - false + ->getElementsByClassName( + $this->getstyle( + [ + "color" => "var(" . $this->getcolorvar("#70757a") . ")", + "font-size" => "14px", + "line-height" => "20px", + "margin-top" => "12px" + ] + ), + "div" ); - if(strlen(trim($text)) !== 0){ + if(count($info) !== 0){ - $description[] = [ - "type" => "text", - "value" => - rtrim( - $text + $info = + explode( + "·", + $this->fuckhtml + ->getTextContent( + $info[0] ) - ]; + ); + + switch(count($info)){ + + case 3: + $web["table"]["Author"] = trim($info[1]); + $web["date"] = strtotime(trim($info[2])); + break; + + case 2: + $web["date"] = strtotime(trim($info[1])); + break; + } } } - - if(count($description) !== 0){ - - $out["answer"][] = [ - "title" => $title, - "description" => $description, - "url" => null, - "thumb" => null, - "table" => [], - "sublink" => [] - ]; - } } } - // reset - $this->fuckhtml->load($html); - } - - // - // get "Related Searches" and "People also search for" - // - $relateds = - $this->fuckhtml - ->getElementsByClassName( - "wyccme", - "div" - ); - - foreach($relateds as $related){ - - $text = - $this->fuckhtml - ->getTextContent( - $related - ); - - if($text == "More results"){ continue; } - - $out["related"][] = $text; - } - - // - // Get text results - // - $results = - $this->fuckhtml - ->getElementsByClassName( - "g", - "div" - ); - - $this->skip_next = false; - - foreach($results as $result){ - - if($this->skip_next){ - - $this->skip_next = false; - continue; - } - - $this->fuckhtml->load($result); - - $web = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - // Detect presence of sublinks - $g = + // + // get categories of content within the search result + // + $cats = $this->fuckhtml - ->getElementsByClassName( - "g", + ->getElementsByAttributeName( + "data-sncf", "div" ); - $sublinks = []; - if(count($g) > 0){ + foreach($cats as $cat){ - $table = + $this->fuckhtml->load($cat); + + // detect image category + $images = $this->fuckhtml ->getElementsByTagName( - "table" + "img" ); - if(count($table) !== 0){ - - // found some sublinks! - - $this->fuckhtml->load($table[0]); - - $tds = - $this->fuckhtml - ->getElementsByTagName( - "td" - ); + if(count($images) !== 0){ - foreach($tds as $td){ - - $this->fuckhtml->load($td); - - $a = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); + foreach($images as $image){ - if( - count($a) === 0 || - ( - isset($a[0]["attributes"]["class"]) && - $a[0]["attributes"]["class"] == "fl" - ) - ){ + if(isset($image["attributes"]["id"])){ + // we found an image - continue; - } - - $td["innerHTML"] = - str_replace( - $a[0]["outerHTML"], - "", - $td["innerHTML"] - ); - - $web["sublink"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $a[0] - ) - ), - "description" => - html_entity_decode( - $this->titledots( - $this->fuckhtml - ->getTextContent( - $td - ) - ) - ), - "url" => - $this->unshiturl( - $a[0] - ["attributes"] - ["href"] - ), - "date" => null - ]; - } - - // reset - $this->fuckhtml->load($result); - } + if(isset($image["attributes"]["width"])){ - // skip on next iteration - $this->skip_next = true; - } - - // get title - $h3 = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($h3) === 0){ - - continue; - } - - $web["title"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $h3[0] - ) - ); - - // get url - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $web["url"] = - $this->unshiturl( - $as[0] - ["attributes"] - ["href"] - ); - - if( - !preg_match( - '/^http/', - $web["url"] - ) - ){ - - // skip if invalid url is found - continue; - } - - // - // probe for twitter carousel - // - $carousel = - $this->fuckhtml - ->getElementsByTagName( - "g-scrolling-carousel" - ); - - if(count($carousel) !== 0){ - - $this->fuckhtml->load($carousel[0]); + $width = (int)$image["attributes"]["width"]; + + if($width == 110){ + + $ratio = "1:1"; + }elseif($width > 110){ + + $ratio = "16:9"; + }else{ + + $ratio = "9:16"; + } + }else{ + + $ratio = "1:1"; + } + + $web["thumb"] = [ + "url" => $this->getdimg($image["attributes"]["id"]), + "ratio" => $ratio + ]; + + continue 2; + } + } + } - $items = + // Detect rating + $spans_unfiltered = $this->fuckhtml ->getElementsByTagName( - "g-inner-card" + "span" ); - $has_thumbnail = false; + $spans = + $this->fuckhtml + ->getElementsByAttributeName( + "aria-label", + $spans_unfiltered + ); - foreach($items as $item){ - - $this->fuckhtml->load($item); + foreach($spans as $span){ - if($has_thumbnail === false){ + if( + preg_match( + '/^Rated/', + $span["attributes"]["aria-label"] + ) + ){ - // get thumbnail - $thumb = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); + // found rating + // scrape rating + preg_match( + '/([0-9.]+).*([0-9.]+)/', + $span["attributes"]["aria-label"], + $rating + ); - if( - count($thumb) !== 0 && - isset($thumb[0]["attributes"]["id"]) - ){ + if(isset($rating[1])){ - $web["thumb"] = [ - "url" => - $this->getdimg( - $thumb[0]["attributes"]["id"] + $web["table"]["Rating"] = + $rating[1] . "/" . $rating[2]; + } + + $has_seen_reviews = 0; + foreach($spans_unfiltered as $span_unfiltered){ + + if( + preg_match( + '/([0-9,.]+) +([A-z]+)$/', + $this->fuckhtml + ->getTextContent( + $span_unfiltered ), - "ratio" => "16:9" - ]; + $votes + ) + ){ + + $has_seen_reviews++; + $web["table"][ucfirst($votes[2])] = $votes[1]; + continue; + } + + $text = + $this->fuckhtml + ->getTextContent( + $span_unfiltered + ); + + if( + $text == "   " || + $text == "" + ){ + + break; + } - $has_thumbnail = true; + switch($has_seen_reviews){ + + case 1: + // scrape price + $web["table"]["Price"] = $text; + $has_seen_reviews++; + break; + + case 2: + // scrape platform + $web["table"]["Platform"] = $text; + $has_seen_reviews++; + break; + + case 3: + // Scrape type + $web["table"]["Medium"] = $text; + break; + } } - // or else, try getting a thumbnail from next container + continue 2; } + } + + // check if its an answer header + $answer_header = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle( + [ + "overflow" => "hidden", + "text-overflow" => "ellipsis" + ] + ), + "span" + ); + + if(count($answer_header) !== 0){ - // cache div - $div = - $this->fuckhtml - ->getElementsByTagName( - "div" - ); - - // get link - $links = + $link = $this->fuckhtml ->getElementsByTagName( "a" ); - // get description of carousel sublink - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "heading", - $div - ); - - if(count($description) !== 0){ - - $description = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - }else{ - - $description = null; - } - - $bottom = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "z-index:2", - $div + $cat["innerHTML"] = + str_replace( + $link[0]["outerHTML"], + "", + $cat["innerHTML"] ); - $title = null; - $date = null; - if(count($bottom) !== 0){ - - $this->fuckhtml->load($bottom[0]); - - $spans = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - $title = + continue; + } + + // we probed everything, assume this is the description + // if we didn't find one cleanly previously + if($web["description"] === null){ + $web["description"] = + $this->titledots( $this->fuckhtml ->getTextContent( - $spans[0] - ); - - $date = - strtotime( - $this->fuckhtml - ->getTextContent( - $spans[count($spans) - 1] - ) - ); - } - - $web["sublink"][] = [ - "title" => $title, - "description" => $description, - "url" => - $this->unshiturl( - $links[0] - ["attributes"] - ["href"] - ), - "date" => $date - ]; + $cat + ) + ); } - - $out["web"][] = $web; - continue; } - // - // get viewcount, time posted and follower count from tag - // - $cite = - $this->fuckhtml - ->getElementsByTagName( - "cite" - ); + // check if description contains date + $description = explode("—", $web["description"], 2); - if(count($cite) !== 0){ - - $this->fuckhtml->load($cite[0]); - - $spans = - $this->fuckhtml - ->getElementsByTagName("span"); - - if(count($spans) === 0){ - - $cites = - explode( - "·", - $this->fuckhtml - ->getTextContent( - $cite[0] - ) - ); - - foreach($cites as $cite){ - - $cite = trim($cite); - - if( - preg_match( - '/(.+) (views|followers|likes)$/', - $cite, - $match - ) - ){ - - $web["table"][ucfirst($match[2])] = - $match[1]; - }elseif( - preg_match( - '/ago$/', - $cite - ) - ){ - - $web["date"] = - strtotime($cite); - } - } - } - - // reset - $this->fuckhtml->load($result); - } - - // - // attempt to fetch description cleanly - // - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:2" - ); - - if(count($description) !== 0){ - - $web["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ); - }else{ + if( + count($description) === 2 && + strlen($description[0]) <= 20 + ){ - // use ANOTHER method where the description is a header of the result - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "wa:/description" - ); + $date = strtotime($description[0]); - if(count($description) !== 0){ - - // get date off that shit - $date = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-size" => "12px", - "line-height" => "1.34", - "display" => "inline-block", - "font-family" => "google sans,arial,sans-serif", - "padding-right" => "0", - "white-space" => "nowrap" - ] - ), - "span" - ); - - if(count($date) !== 0){ - - $description[0]["innerHTML"] = - str_replace( - $date[0]["outerHTML"], - "", - $description[0]["innerHTML"] - ); - - $web["date"] = - strtotime( - $this->fuckhtml - ->getTextContent( - $date[0] - ) - ); - } - - $web["description"] = - $this->fuckhtml - ->getTextContent( - $description[0] - ); - }else{ - - // Yes.. You guessed it, use ANOTHER method to get descriptions - // off youtube containers - $description = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "-webkit-box-orient" => "vertical", - "display" => "-webkit-box", - "font-size" => "14px", - "-webkit-line-clamp" => "2", - "line-height" => "22px", - "overflow" => "hidden", - "word-break" => "break-word", - "color" => "#4d5156" - ] - ), - "div" - ); + if($date !== false){ - if(count($description) !== 0){ - - // check for video duration - $duration = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "background-color" => "rgba(0,0,0,0.6)", - "color" => "#fff", - "fill" => "#fff" - ] - ), - "div" - ); - - if(count($duration) !== 0){ - - $web["table"]["Duration"] = - $this->fuckhtml - ->getTextContent( - $duration[0] - ); - } - - $web["description"] = - $this->titledots( - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $description[0] - ) - ) - ); - - // get author + time posted - $info = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "color" => "var(" . $this->getcolorvar("#70757a") . ")", - "font-size" => "14px", - "line-height" => "20px", - "margin-top" => "12px" - ] - ), - "div" - ); - - if(count($info) !== 0){ - - $info = - explode( - "·", - $this->fuckhtml - ->getTextContent( - $info[0] - ) - ); - - switch(count($info)){ - - case 3: - $web["table"]["Author"] = trim($info[1]); - $web["date"] = strtotime(trim($info[2])); - break; - - case 2: - $web["date"] = strtotime(trim($info[1])); - break; - } - } - } + $web["date"] = $date; + $web["description"] = ltrim($description[1]); } } - // - // get categories of content within the search result - // - $cats = + // fetch youtube thumbnail + $thumbnail = $this->fuckhtml - ->getElementsByAttributeName( - "data-sncf", + ->getElementsByClassName( + $this->getstyle( + [ + "border-radius" => "8px", + "height" => "fit-content", + "justify-content" => "center", + "margin-right" => "20px", + "margin-top" => "4px", + "position" => "relative", + "width" => "fit-content" + ] + ), "div" ); - foreach($cats as $cat){ + if(count($thumbnail) !== 0){ - $this->fuckhtml->load($cat); + // load thumbnail container + $this->fuckhtml->load($thumbnail[0]); - // detect image category - $images = + $image = $this->fuckhtml ->getElementsByTagName( "img" ); - if(count($images) !== 0){ + if( + count($image) !== 0 && + isset($image[0]["attributes"]["id"]) + ){ - foreach($images as $image){ - - if(isset($image["attributes"]["id"])){ - // we found an image - - if(isset($image["attributes"]["width"])){ - - $width = (int)$image["attributes"]["width"]; - - if($width == 110){ - - $ratio = "1:1"; - }elseif($width > 110){ - - $ratio = "16:9"; - }else{ - - $ratio = "9:16"; - } - }else{ - - $ratio = "1:1"; - } - - $web["thumb"] = [ - "url" => $this->getdimg($image["attributes"]["id"]), - "ratio" => $ratio - ]; - - continue 2; - } - } - } - - // Detect rating - $spans_unfiltered = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - $spans = - $this->fuckhtml - ->getElementsByAttributeName( - "aria-label", - $spans_unfiltered - ); - - foreach($spans as $span){ - - if( - preg_match( - '/^Rated/', - $span["attributes"]["aria-label"] - ) - ){ - - // found rating - // scrape rating - preg_match( - '/([0-9.]+).*([0-9.]+)/', - $span["attributes"]["aria-label"], - $rating - ); - - if(isset($rating[1])){ - - $web["table"]["Rating"] = - $rating[1] . "/" . $rating[2]; - } - - $has_seen_reviews = 0; - foreach($spans_unfiltered as $span_unfiltered){ - - if( - preg_match( - '/([0-9,.]+) +([A-z]+)$/', - $this->fuckhtml - ->getTextContent( - $span_unfiltered - ), - $votes - ) - ){ - - $has_seen_reviews++; - $web["table"][ucfirst($votes[2])] = $votes[1]; - continue; - } - - $text = - $this->fuckhtml - ->getTextContent( - $span_unfiltered - ); - - if( - $text == "   " || - $text == "" - ){ - - break; - } - - switch($has_seen_reviews){ - - case 1: - // scrape price - $web["table"]["Price"] = $text; - $has_seen_reviews++; - break; - - case 2: - // scrape platform - $web["table"]["Platform"] = $text; - $has_seen_reviews++; - break; - - case 3: - // Scrape type - $web["table"]["Medium"] = $text; - break; - } - } - - continue 2; - } - } - - // check if its a table of small sublinks - $table = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "display" => "table", - "white-space" => "nowrap", - "margin" => "5px 0", - "line-height" => "1.58", - "color" => "var(" . $this->getcolorvar("#70757a") . ")" - ] - ), - "div" - ); - - if(count($table) !== 0){ - - $this->fuckhtml->load($table[0]); - - $rows = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "display" => "flex", - "white-space" => "normal" - ] - ), - "div" - ); - - foreach($rows as $row){ - - $this->fuckhtml->load($row); - - $sublink = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null - ]; - - $link = - $this->fuckhtml - ->getElementsByTagName( - "a" - )[0]; - - $sublink["title"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $link - ) - ); - - $sublink["url"] = - $this->unshiturl( - $link - ["attributes"] - ["href"] - ); - - $row["innerHTML"] = - str_replace( - $link["outerHTML"], - "", - $row["innerHTML"] - ); - - $this->fuckhtml->load($row); - - $spans = - $this->fuckhtml - ->getElementsByTagName( - "span" - ); - - foreach($spans as $span){ - - $text = - $this->fuckhtml - ->getTextContent( - $span - ); - - if( - preg_match( - '/answers?$/', - $text - ) - ){ - - $sublink["description"] = - $text; - - continue; - } - - $time = strtotime($text); - - if($time !== false){ - - $sublink["date"] = $time; - } - } - - $web["sublink"][] = $sublink; - } - - // reset - $this->fuckhtml->load($cat); - continue; - } - - // check if its an answer header - $answer_header = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "overflow" => "hidden", - "text-overflow" => "ellipsis" - ] - ), - "span" - ); - - if(count($answer_header) !== 0){ - - $link = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - $cat["innerHTML"] = - str_replace( - $link[0]["outerHTML"], - "", - $cat["innerHTML"] - ); - - $web["sublink"][] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $link[0] - ), - "description" => - $this->titledots( - trim( - str_replace( - "\xc2\xa0", - " ", - html_entity_decode( - $this->fuckhtml - ->getTextContent( - $cat - ) - ) - ), - " ·" - ) - ), - "url" => - $this->fuckhtml - ->getTextContent( - $link[0] - ["attributes"] - ["href"] - ), - "date" => null - ]; - - continue; - } - - // check if its list of small sublinks - $urls = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($urls) !== 0){ - - // found small links - foreach($urls as $url){ - - $target = - $this->fuckhtml - ->getTextContent( - $url - ["attributes"] - ["href"] - ); - - if( - !preg_match( - '/^http/', - $target - ) - ){ - - continue; - } - - $web["sublink"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $url - ) - ), - "description" => null, - "url" => $target, - "date" => null - ]; - } - - continue; - } - - // we probed everything, assume this is the description - // if we didn't find one cleanly previously - if($web["description"] === null){ - $web["description"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $cat - ) - ); - } - } - - // check if description contains date - $description = explode("—", $web["description"], 2); - - if( - count($description) === 2 && - strlen($description[0]) <= 20 - ){ - - $date = strtotime($description[0]); - - if($date !== false){ - - $web["date"] = $date; - $web["description"] = ltrim($description[1]); - } - } - - // fetch youtube thumbnail - $thumbnail = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "border-radius" => "8px", - "height" => "fit-content", - "justify-content" => "center", - "margin-right" => "20px", - "margin-top" => "4px", - "position" => "relative", - "width" => "fit-content" - ] - ), - "div" - ); - - if(count($thumbnail) !== 0){ - - // load thumbnail container - $this->fuckhtml->load($thumbnail[0]); - - $image = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - if( - count($image) !== 0 && - isset($image[0]["attributes"]["id"]) - ){ - - $web["thumb"] = [ - "url" => - $this->unshit_thumb( - $this->getdimg( - $image[0]["attributes"]["id"] - ) - ), - "ratio" => "16:9" - ]; - } - - // reset - $this->fuckhtml->load($result); - } - - $out["web"][] = $web; - } - - // reset - $this->fuckhtml->load($result_div); - - // - // Get instant answers - // - $answer_containers = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "padding-left" => "0px", - "padding-right" => "0px" - ] - ), - "div" - ); - - $date_class = - $this->getstyle( - [ - "font-size" => "12px", - "line-height" => "1.34", - "display" => "inline-block", - "font-family" => "google sans,arial,sans-serif", - "padding-right" => "0", - "white-space" => "nowrap" - ] - ); - - foreach($answer_containers as $container){ - - $this->fuckhtml->load($container); - - $web = [ - "title" => null, - "description" => null, - "url" => null, - "date" => null, - "type" => "web", - "thumb" => [ - "url" => null, - "ratio" => null - ], - "sublink" => [], - "table" => [] - ]; - - $answers = - $this->fuckhtml - ->getElementsByAttributeName( - "aria-controls", - "div" - ); - - $item_insert_pos = 1; - foreach($answers as $answer){ - - $out["related"][] = - $this->fuckhtml - ->getTextContent( - $answer - ); - - if( - isset( - $this->blobs[ - $answer - ["attributes"] - ["aria-controls"] - ] - ) - ){ - - $this->fuckhtml->load( - $this->blobs[ - $answer - ["attributes"] - ["aria-controls"] - ] - ); - - $divs = - $this->fuckhtml - ->getElementsByAttributeName( - "id", - "div" - ); - - foreach($divs as $div){ - - if( - !isset( - $this->blobs[ - $div - ["attributes"] - ["id"] - ] - ) - ){ - - continue; - } - - $this->fuckhtml->load( - $this->blobs[ - $div - ["attributes"] - ["id"] - ] - ); - - // get url - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($as) !== 0){ - - $web["url"] = - $this->unshiturl( - $as[0]["attributes"]["href"] - ); - - // skip entries that redirect to a search - if( - !preg_match( - '/^http/', - $web["url"] - ) - ){ - - continue 3; - } - } - - // get title - $h3 = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($h3) !== 0){ - - $web["title"] = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $h3[0] - ) - ); - } - - $description = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "wa:/description", - "div" - ); - - if(count($description) !== 0){ - - // check for date - $this->fuckhtml->load($description[0]); - - $date = - $this->fuckhtml - ->getElementsByClassName( - $date_class, - "span" - ); - - if(count($date) !== 0){ - - $description[0]["innerHTML"] = - str_replace( - $date[0]["outerHTML"], - "", - $description[0]["innerHTML"] - ); - - $web["date"] = - strtotime( - $this->fuckhtml - ->getTextContent( - $date[0] - ) - ); - } - - $web["description"] = - ltrim( - $this->fuckhtml - ->getTextContent( - $description[0] - ), - ": " - ); - } - } - - foreach($out["web"] as $item){ - - if($item["url"] == $web["url"]){ - - continue 2; - } - } - - array_splice($out["web"], $item_insert_pos, 0, [$web]); - $item_insert_pos++; - } - } - } - - // reset - $this->fuckhtml->load($result_div); - - // - // Scrape word definition - // - $definition_container = - $this->fuckhtml - ->getElementsByClassName( - "lr_container", - "div" - ); - - if(count($definition_container) !== 0){ - - $this->fuckhtml->load($definition_container[0]); - - // get header - $header = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "EntryHeader", - "div" - ); - - if(count($header) !== 0){ - - $description = []; - - $this->fuckhtml->load($header[0]); - - $title_div = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-family" => "google sans,arial,sans-serif", - "font-size" => "28px", - "line-height" => "36px" - ] - ) - ); - - if(count($title_div) !== 0){ - - $title = - $this->fuckhtml - ->getTextContent( - $title_div[0] - ); - }else{ - - $title = "Word definition"; - } - - $subtext_div = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "font-family" => "arial,sans-serif", - "font-size" => "14px", - "line-height" => "22px" - ] - ), - "span" - ); - - if(count($subtext_div) !== 0){ - - $description[] = [ - "type" => "quote", - "value" => - $this->fuckhtml - ->getTextContent( - $subtext_div[0] - ) - ]; - } - - // get audio - $audio = - $this->fuckhtml - ->getElementsByTagName( - "audio" - ); - - if(count($audio) !== 0){ - - $this->fuckhtml->load($audio[0]); - - $source = - $this->fuckhtml - ->getElementsByTagName( - "source" - ); - - if(count($source) !== 0){ - - $description[] = [ - "type" => "audio", - "url" => - preg_replace( - '/^\/\//', - "https://", - $this->fuckhtml - ->getTextContent( - $source[0] - ["attributes"] - ["src"] - ) - ) - ]; - } - - } - - // remove header to avoid confusion - $definition_container[0]["innerHTML"] = - str_replace( - $header[0]["outerHTML"], - "", - $definition_container[0]["innerHTML"] - ); - - // reset - $this->fuckhtml->load($definition_container[0]); - - $vmods = - $this->fuckhtml - ->getElementsByClassName( - "vmod", - "div" - ); - - foreach($vmods as $category){ - - if( - !isset( - $category - ["attributes"] - ["data-topic"] - ) || - $category - ["attributes"] - ["class"] != "vmod" - ){ - - continue; - } - - $this->fuckhtml->load($category); - - // get category type - $type = - $this->fuckhtml - ->getElementsByTagName( - "i" - ); - - if(count($type) !== 0){ - - $description[] = [ - "type" => "title", - "value" => - $this->fuckhtml - ->getTextContent( - $type[0] - ) - ]; - } - - // get heading text - $headings = - $this->fuckhtml - ->getElementsByClassName( - "xpdxpnd", - "div" - ); - - foreach($headings as $heading){ - - $description[] = [ - "type" => "quote", - "value" => - $this->fuckhtml - ->getTextContent( - $heading - ) - ]; - } - - $definitions = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "SenseDefinition", - "div" - ); - - $i = 1; - $text = []; - - foreach($definitions as $definition){ - - $text[] = - $i . ". " . - $this->fuckhtml - ->getTextContent( - $definition - ); - - $i++; - } - - if(count($text) !== 0){ - - $description[] = [ - "type" => "text", - "value" => - implode("\n", $text) - ]; - } - } - - $out["answer"][] = [ - "title" => $title, - "description" => $description, - "url" => null, - "thumb" => null, - "table" => [], - "sublink" => [] - ]; - } - - // reset - $this->fuckhtml->load($result_div); - } - - // - // scrape elements with a g-section-with-header - // includes: images, news carousels - // - - $g_sections = - $this->fuckhtml - ->getElementsByTagName( - "g-section-with-header" - ); - - if(count($g_sections) !== 0){ - foreach($g_sections as $g_section){ - - // parse elements with a g-section-with-header - $this->fuckhtml->load($g_section); - - $div_title = - $this->fuckhtml - ->getElementsByClassName( - "a-no-hover-decoration", - "a" - ); - - if(count($div_title) !== 0){ - - // title detected, skip - continue; - } - - // no title detected: detect news container - $news = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "outline-offset" => "-1px", - "outline-width" => "1px", - "display" => "flex", - "flex-direction" => "column", - "flex-grow" => "1" - ] - ) - ); - - foreach($news as $new){ - - $this->fuckhtml->load($new); - - $image = - $this->fuckhtml - ->getElementsByAttributeName( - "id", - "img" - ); - - if( - count($image) !== 0 && - !( - isset($image[0]["attributes"]["style"]) && - strpos( - $image[0]["attributes"]["style"], - "height:18px" - ) !== false - ) - ){ - - $thumb = [ - "url" => - $this->getdimg( - $image[0] - ["attributes"] - ["id"] - ), - "ratio" => "1:1" - ]; - } - - $title = - $this->titledots( - $this->fuckhtml - ->getTextContent( - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "heading", - "div" - )[0] - ) - ); - - $date_div = - $this->fuckhtml - ->getElementsByAttributeName( - "style", - "div" - ); - - $date = null; - - if(count($date_div) !== 0){ - - foreach($date_div as $div){ - - if( - strpos( - $div["attributes"]["style"], - "bottom:" - ) !== false - ){ - - $date = - strtotime( - $this->fuckhtml - ->getTextContent( - $div - ) - ); - break; - } - } - }else{ - - $date = null; - } - - $out["news"][] = [ - "title" => $title, - "description" => null, - "date" => $date, - "thumb" => $thumb, + $web["thumb"] = [ "url" => - $this->fuckhtml - ->getTextContent( - $new - ["attributes"] - ["href"] - ) - ]; - } - } - - // reset - $this->fuckhtml->load($result_div); - } - - // - // Parse images (carousel, left hand-side) - // - $image_carousels = - $this->fuckhtml - ->getElementsByAttributeValue( - "id", - "media_result_group", - "div" - ); - - if(count($image_carousels) !== 0){ - - foreach($image_carousels as $image_carousel){ - - $this->fuckhtml->load($image_carousel); - - // get related searches in image carousel - $relateds = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "display" => "inline-block", - "margin-right" => "6px", - "outline" => "none", - "padding" => "6px 0" - ], - "a" - ) - ); - - foreach($relateds as $related){ - - if(!isset($related["innerHTML"])){ - - // found an image - continue; - } - - $text = - $this->fuckhtml - ->getTextContent( - $related - ); - - if($text != ""){ - - $out["related"][] = $text; - } - } - - $div = - $this->fuckhtml - ->getElementsByTagName( - "div" - ); - - // get loaded images - $images = - $this->fuckhtml - ->getElementsByClassName( - "ivg-i", - $div - ); - - foreach($images as $image){ - - $this->fuckhtml->load($image); - - $img_tags = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - if( - !isset($image["attributes"]["data-docid"]) || - !isset($this->image_arr[$image["attributes"]["data-docid"]]) - ){ - - continue; - } - - // search for the right image tag - $image_tag = false; - foreach($img_tags as $img){ - - if( - isset( - $img - ["attributes"] - ["alt"] - ) && - trim( - $img - ["attributes"] - ["alt"] - ) != "" - ){ - - $image_tag = $img; - break; - } - } - - if($image_tag === false){ - - continue; - } - - $out["image"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $image_tag - ["attributes"] - ["alt"] + $this->unshit_thumb( + $this->getdimg( + $image[0]["attributes"]["id"] ) - ), - "source" => - $this->image_arr[ - $image - ["attributes"] - ["data-docid"] - ], - "url" => - $this->fuckhtml - ->getTextContent( - $image - ["attributes"] - ["data-lpage"] - ) - ]; - } - - // get unloaded javascript images - $images_js_sel = - $this->fuckhtml - ->getElementsByAttributeName( - "id", - $div - ); - - $loaded = []; - - foreach($images_js_sel as $sel){ - - if( - !isset($this->blobs[$sel["attributes"]["id"]]) || - in_array((string)$sel["attributes"]["id"], $loaded, true) - ){ - - // not an unloaded javascript image - continue; - } - - $loaded[] = $sel["attributes"]["id"]; - - // get yet another javascript component - $this->fuckhtml->load($this->blobs[$sel["attributes"]["id"]]); - - // get js node: contains title & url - $js_node = - $this->fuckhtml - ->getElementsByTagName( - "div" - )[0]; - - if(!isset($this->blobs[$js_node["attributes"]["id"]])){ - - // did not find refer id - continue; - } - - // load second javascript component - $this->fuckhtml->load($this->blobs[$js_node["attributes"]["id"]]); - - // get title from image alt text. - // data-src from this image is cropped, ignore it.. - $img = - $this->fuckhtml - ->getElementsByTagName( - "img" - )[0]; - - $out["image"][] = [ - "title" => - $this->fuckhtml - ->getTextContent( - $img["attributes"]["alt"] - ), - "source" => - $this->image_arr[ - $js_node["attributes"]["data-docid"] - ], - "url" => - $this->fuckhtml - ->getTextContent( - $js_node["attributes"]["data-lpage"] - ) + ), + "ratio" => "16:9" ]; } + + // reset + $this->fuckhtml->load($result); } - // reset - $this->fuckhtml->load($result_div); + $out["web"][] = $web; } - // - // Parse videos - // + // reset $this->fuckhtml->load($result_div); - $videos = - $this->fuckhtml - ->getElementsByAttributeName( - "data-vid", - "div" - ); - - foreach($videos as $video){ - - $this->fuckhtml->load($video); - - // get url - $url = - $this->fuckhtml - ->getTextContent( - $video - ["attributes"] - ["data-surl"] - ); - - foreach($out["web"] as $link){ + // + // craft $npt token + // + if( + $last_page === false && + count($out["web"]) !== 0 + ){ + if(!isset($params["start"])){ - if($link["url"] == $url){ - - // ignore if we already have the video in $out["web"] - continue 2; - } - } - - // get heading element - $heading = - $this->fuckhtml - ->getElementsByAttributeValue( - "role", - "heading", - "div" - ); - - if(count($heading) === 0){ + $params["start"] = 20; + }else{ - // no heading, fuck this. - continue; + $params["start"] += 20; } - // get thumbnail before loading heading object - $image = - $this->fuckhtml - ->getElementsByAttributeName( - "id", - "img" + $out["npt"] = + $this->backend + ->store( + json_encode($params), + $pagetype, + $proxy ); + } + + return $out; + } + + + private function scrape_dimg($html){ + + // get images loaded through javascript + $this->dimg = []; + + preg_match_all( + '/function\(\){google\.ldi=({.*?});/', + $html, + $dimg + ); + + if(isset($dimg[1])){ - if(count($image) !== 0){ - - $thumb = [ - "url" => $this->getdimg($image[0]["attributes"]["id"]), - "ratio" => "16:9" - ]; - }else{ + foreach($dimg[1] as $i){ - $thumb = [ - "url" => null, - "ratio" => null - ]; + $tmp = json_decode($i, true); + foreach($tmp as $key => $value){ + + $this->dimg[$key] = + $this->unshit_thumb( + $value + ); + } } + } + + // get additional javascript base64 images + preg_match_all( + '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', + $html, + $dimg + ); + + if(isset($dimg[1])){ - // get duration - $duration_div = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "border-radius" => "10px", - "font-family" => "arial,sans-serif-medium,sans-serif", - "font-size" => "12px", - "line-height" => "16px", - "padding-block" => "2px", - "padding-inline" => "8px" - ] - ), - "div" - ); - - if(count($duration_div) !== 0){ - - $duration = - $this->hms2int( - $this->fuckhtml - ->getTextContent( - $duration_div[0] - ) - ); - }else{ + for($i=0; $ifuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "background-color" => "#d93025", - "border-radius" => "10px", - "color" => "#fff", - "font-family" => "arial,sans-serif-medium,sans-serif", - "font-size" => "12px", - "line-height" => "16px", - "padding-block" => "2px", - "padding-inline" => "8px" - ] - ), - "span" + ->parseJsString( + $dimg[1][$i] ); - if(count($duration) !== 0){ - - $duration = "_LIVE"; - }else{ + foreach($delims as $delim){ - $duration = null; + $this->dimg[trim($delim, "'")] = $string; } } + } + } + + + private function scrape_imagearr($html){ + // get image links arrays + preg_match_all( + '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', + $html, + $image_arr + ); + + $this->image_arr = []; + if(isset($image_arr[1])){ - // load heading - $this->fuckhtml->load($heading[0]); - - // get title - $title = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( + for($i=0; $iimage_arr[$image_arr[1][$i]] = + [ [ - "font-family" => "arial,sans-serif", - "font-size" => "16px", - "font-weight" => "400", - "line-height" => "24px" + "url" => + $this->fuckhtml + ->parseJsString( + $image_arr[5][$i] + ), + "width" => (int)$image_arr[7][$i], + "height" => (int)$image_arr[6][$i] + ], + [ + "url" => + $this->unshit_thumb( + $this->fuckhtml + ->parseJsString( + $image_arr[2][$i] + ) + ), + "width" => (int)$image_arr[4][$i], + "height" => (int)$image_arr[3][$i] ] - ), - "div" - ); + ]; + } + } + } + + + private function getdimg($dimg){ + + return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; + } + + + private function unshit_thumb($url){ + // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj + // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA + + $parts = parse_url($url); + + if( + isset($parts["host"]) && + preg_match( + '/tbn.*\.gstatic\.com/', + $parts["host"] + ) + ){ - if(count($title) === 0){ + parse_str($parts["query"], $params); + + if(isset($params["q"])){ + + return "https://" . $parts["host"] . "/images?q=" . $params["q"]; + } + } + + return $url; + } + + + private function parsestyles(){ + + $styles = []; + + $style_div = + $this->fuckhtml + ->getElementsByTagName( + "style" + ); + + $raw_styles = ""; + + foreach($style_div as $style){ + + $raw_styles .= $style["innerHTML"]; + } + + // filter out media/keyframe queries + $raw_styles = + preg_replace( + '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', + "", + $raw_styles + ); + + // get styles + preg_match_all( + '/(.+?){([\S\s]*?)}/', + $raw_styles, + $matches + ); + + for($i=0; $ititledots( - $this->fuckhtml - ->getTextContent( - $title[0] - ) - ); - - // get date - $date_div = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "color" => "var(" . $this->getcolorvar("#70757a") . ")", - "font-size" => "14px" - ] - ), - "div" - ); + $names = explode(",", $matches[1][$i]); - if(count($date_div) !== 0){ + // h1,h2,h3 will each get their own array index + foreach($names as $name){ - $date = strtotime( - $this->fuckhtml - ->getTextContent( - $date_div[0] - ) - ); + $name = trim($name, "}\t\n\r\0\x0B"); - if($date === false){ + foreach($values as $key => $value){ - // failed to parse date - $date = null; + $styles[$name][$key] = $value; } - }else{ - - $date = null; } + } + + foreach($styles as $key => $values){ - $out["video"][] = [ - "title" => $title, - "description" => null, - "date" => $date, - "duration" => $duration, - "views" => null, - "thumb" => $thumb, - "url" => $url - ]; + $styles[$key]["_c"] = count($values); } - // - // Parse featured results (which contain images, fuck the rest desu) - // - $this->fuckhtml->load($html); - $top = - $this->fuckhtml - ->getElementsByAttributeValue( - "aria-label", - "Featured results", - "div" - ); + $this->styles = $styles; - if(count($top) !== 0){ - - $this->fuckhtml->load($top[0]); - - // get images - $grid = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "border-radius" => "20px", - "display" => "grid", - "grid-gap" => "2px", - "grid-template-rows" => "repeat(2,minmax(0,1fr))", - "overflow" => "hidden", - "bottom" => "0", - "left" => "0", - "right" => "0", - "top" => "0", - "position" => "absolute", - ] - ), - "div" - ); + // get CSS colors + $this->css_colors = []; + + if(isset($this->styles[":root"])){ - if(count($grid) !== 0){ - - // we found image grid - $this->fuckhtml->load($grid[0]); - - $images_div = - $this->fuckhtml - ->getElementsByAttributeName( - "data-attrid", - "div" - ); + foreach($this->styles[":root"] as $key => $value){ - foreach($images_div as $image_div){ - - $this->fuckhtml->load($image_div); - - $image = - $this->fuckhtml - ->getElementsByTagName( - "img" - ); - - if( - count($image) === 0 || - !isset($image_div["attributes"]["data-docid"]) || - !isset($this->image_arr[$image_div["attributes"]["data-docid"]]) - ){ - - // ?? no image, continue - continue; - } - - $out["image"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $image[0]["attributes"]["alt"] - ) - ), - "source" => - $this->image_arr[ - $image_div["attributes"]["data-docid"] - ], - "url" => - $this->fuckhtml - ->getTextContent( - $image_div["attributes"]["data-lpage"] - ) - ]; - } + $this->css_colors[$value] = strtolower($key); } } + } + + + + private function getstyle($styles){ + $styles["_c"] = count($styles); - // - // craft $npt token - // - if( - $last_page === false && - count($out["web"]) !== 0 - ){ - if(!isset($params["start"])){ + foreach($this->styles as $style_key => $style_values){ + + if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ - $params["start"] = 20; - }else{ + $style_key = + explode(" ", $style_key); - $params["start"] += 20; + $style_key = $style_key[count($style_key) - 1]; + + return + ltrim( + str_replace( + [".", "#"], + " ", + $style_key + ) + ); } - - $out["npt"] = - $this->backend - ->store( - json_encode($params), - $pagetype, - $proxy - ); } + return false; + } + + + + private function getcolorvar($color){ - // - // Parse right handside - // - $this->fuckhtml->load($html); - - $rhs = - $this->fuckhtml - ->getElementById( - "rhs" - ); - - if($rhs === null){ + if(isset($this->css_colors[$color])){ - return $out; + return $this->css_colors[$color]; } - $this->fuckhtml->load($rhs); - - // get images gallery - $image_gallery = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-rc", - "ivg-i", - "div" - ); + return null; + } + + + + public function web($get){ - if(count($image_gallery) !== 0){ - - $this->fuckhtml->load($image_gallery[0]); + if($get["npt"]){ - // get images - $images_div = - $this->fuckhtml - ->getElementsByClassName( - "ivg-i", - "div" - ); + [$get, $proxy] = $this->backend->get($get["npt"], "web"); - foreach($images_div as $image_div){ - - $this->fuckhtml->load($image_div); - - $image = - $this->fuckhtml - ->getElementsByTagName( - "img" + try{ + $html = + $this->get( + $proxy, + "https://www.google.com" . $get, + [], + true ); + }catch(Exception $error){ - if( - count($image) === 0 || - !isset( - $this->image_arr[ - $image_div - ["attributes"] - ["data-docid"] - ] - ) - ){ - - continue; - } + throw new Exception("Failed to get HTML"); + } + + }else{ + $search = $get["s"]; + $country = $get["country"]; + $nsfw = $get["nsfw"]; + $lang = $get["lang"]; + $older = $get["older"]; + $newer = $get["newer"]; + $spellcheck = $get["spellcheck"]; + $proxy = $this->backend->get_ip(); + + $offset = 0; + + $params = [ + "q" => $search, + "hl" => "en", + "num" => 20 + ]; + + // country + if($country != "any"){ - foreach($out["image"] as $existing_image){ - - // might already exist - if( - $existing_image["source"][1]["url"] == - $this->image_arr[ - $image_div - ["attributes"] - ["data-docid"] - ][1]["url"] - ){ - - continue 2; - } - } + $params["gl"] = $country; + } + + // nsfw + $params["safe"] = $nsfw == "yes" ? "off" : "active"; + + // language + if($lang != "any"){ - $out["image"][] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $image[0] - ["attributes"] - ["alt"] - ) - ), - "source" => - $this->image_arr[ - $image_div - ["attributes"] - ["data-docid"] - ], - "url" => - $this->fuckhtml - ->getTextContent( - $image_div - ["attributes"] - ["data-lpage"] - ) - ]; + $params["lr"] = "lang_" . $lang; } - // reset - $this->fuckhtml->load($rhs); - } - - // get header container - $header = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "padding" => "0 0 16px 20px", - "display" => "flex" - ] - ), - "div" - ); - - // stop parsing wikipedia heads if there isn't a header - $description = []; - $title = "About"; - - if(count($header) !== 0){ + // generate tbs + $tbs = []; - $this->fuckhtml->load($header[0]); + // get date + $older = $older === false ? null : date("m/d/Y", $older); + $newer = $newer === false ? null : date("m/d/Y", $newer); - // g-snackbar-action present: we found a button instead if( - count( - $this->fuckhtml - ->getElementsByTagName( - "g-snackbar-action" - ) - ) !== 0 + $older !== null || + $newer !== null ){ - $title_tag = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "title", - "div" - ); + $tbs["cdr"] = "1"; + $tbs["cd_min"] = $newer; + $tbs["cd_max"] = $older; + } + + // spellcheck filter + if($spellcheck == "no"){ - if(count($title_tag) !== 0){ - $title = - $this->fuckhtml - ->getTextContent( - $title_tag[0] - ); - - $header[0]["innerHTML"] = - str_replace( - $title_tag[0]["outerHTML"], - "", - $header[0]["innerHTML"] - ); - - // if header still contains text, add it as a subtitle in description - $subtitle = - $this->fuckhtml - ->getTextContent( - $header[0] - ); + $params["nfpr"] = "1"; + } + + if(count($tbs) !== 0){ + + $params["tbs"] = ""; + + foreach($tbs as $key => $value){ - if(strlen($subtitle) !== 0){ - - $description[] = [ - "type" => "quote", - "value" => $subtitle - ]; - } + $params["tbs"] .= $key . ":" . $value . ","; } + + $params["tbs"] = rtrim($params["tbs"], ","); + } + + try{ + $html = + $this->get( + $proxy, + "https://www.google.com/search", + $params, + true + ); + }catch(Exception $error){ + + throw new Exception("Failed to get HTML"); } - // reset - $this->fuckhtml->load($rhs); + //$html = file_get_contents("scraper/google.html"); } - // get description elements - $url = null; + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + $this->fuckhtml->load($html); + + $this->parsestyles(); - $text = + $boxes = $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "description", + ->getElementsByClassName( + $this->getstyle([ + "border" => "thin solid #dadce0", + "padding" => "12px 16px 12px 16px", + "margin-bottom" => "10px", + "font-family" => "sans-serif" + ]), "div" ); - if(count($text) !== 0){ + $skip_next = false; + + // get next page token + $npt = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "border" => "thin solid #dadce0", + "color" => "#70757a", + "font-size" => "14px", + "text-align" => "center", + "table-layout" => "fixed", + "width" => "100%" + ]), + "table" + ); + + if(count($npt) !== 0){ - $this->fuckhtml->load($text[0]); + $this->fuckhtml->load($npt[0]); - $a = + $as = $this->fuckhtml ->getElementsByTagName( "a" ); - if(count($a) !== 0){ - // get link and remove it from description - - $a = $a[count($a) - 1]; - - $text[0]["innerHTML"] = - str_replace( - $a["outerHTML"], - "", - $text[0]["innerHTML"] - ); + foreach($as as $a){ - $url = + $text = $this->fuckhtml ->getTextContent( $a - ["attributes"] - ["href"] ); - } - - $description[] = [ - "type" => "text", - "value" => - html_entity_decode( - preg_replace( - '/^Description/', - "", + + if( + $text == "Next >" || + $text == ">" + ){ + + $out["npt"] = + $this->backend->store( $this->fuckhtml ->getTextContent( - $text[0] - ) - ) - ) - ]; + $a["attributes"]["href"] + ), + "web", + $proxy + ); + } + } - // reset - $this->fuckhtml->load($rhs); + $this->fuckhtml->load($html); } - // get reviews (google play, steam, etc) - $review_container = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "align-items" => "start", - "display" => "flex" - ] - ), - "div" - ); - - if(count($review_container) !== 0){ + $first_box = true; + foreach($boxes as $box){ - $this->fuckhtml->load($review_container[0]); + $this->fuckhtml->load($box); - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($as) !== 0){ + if($first_box){ - $description[] = [ - "type" => "title", - "value" => "Ratings" - ]; + // + // Probe for word correction + // + $first_box = false; - foreach($as as $a){ - - $this->fuckhtml->load($a); + $txt = + $this->fuckhtml + ->getTextContent($box); + + if( + preg_match( + '/^Showing results for /', + $txt + ) + ){ - $spans = + $as = $this->fuckhtml ->getElementsByTagName( - "span" + "a" ); - if(count($spans) >= 2){ - - $value = - trim( - $this->fuckhtml - ->getTextContent( - $spans[1] - ), - "· " - ); - - if( - $value == "" && - isset($spans[2]) - ){ - - $value = - $this->fuckhtml - ->getTextContent( - $spans[2] - ); - } + if(count($as) === 2){ - $description[] = [ - "type" => "link", - "url" => + $out["spelling"] = [ + "type" => "including", + "using" => $this->fuckhtml ->getTextContent( - $a["attributes"] - ["href"] + $as[0] ), - "value" => $value - ]; - - $description[] = [ - "type" => "text", - "value" => - ": " . + "correction" => $this->fuckhtml ->getTextContent( - $spans[0] - ) . "\n" + $as[1] + ) ]; } + continue; } } - // reset - $this->fuckhtml->load($rhs); - } - - // initialize sublinks - $sublinks = []; - - // get description from business - if(count($description) === 0){ - - $data_attrid = - $this->fuckhtml - ->getElementsByAttributeName( - "data-attrid" - ); - - $summary = + // probe for custom container + $container_title = $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "kc:/local:one line summary", - $data_attrid + ->getElementsByClassName( + $this->getstyle([ + "font-weight" => "bold" + ]) ); - if(count($summary) !== 0){ + if(count($container_title) !== 0){ - $description[] = [ - "type" => "quote", - "value" => + $container_title = + strtolower( $this->fuckhtml ->getTextContent( - $summary[0] + $container_title[0] ) - ]; - - // remove summary so it doesnt get parsed as a table - $rhs["innerHTML"] = - str_replace( - $summary[0]["outerHTML"], - "", - $rhs["innerHTML"] ); - $this->fuckhtml->load($rhs); - } - - $address = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "kc:/location/location:address", - $data_attrid - ); - - if(count($address) !== 0){ - - $description[] = [ - "type" => "text", - "value" => + if($container_title == "images"){ + + // + // Parse image carousel + // + $images = $this->fuckhtml - ->getTextContent( - $address[0] - ) - ]; - } - - // get title - $title_div = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "title", - $data_attrid - ); - - if(count($title_div) !== 0){ - - $title = - $this->fuckhtml - ->getTextContent( - $title_div[0] - ); - } - - // get phone number - $phone = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "kc:/local:alt phone", - $data_attrid - ); - - if(count($phone) !== 0){ - - $this->fuckhtml->load($phone[0]); + ->getElementsByClassName( + $this->getstyle([ + "display" => "inline-block", + "padding" => "2px", + "padding-bottom" => "4px" + ]), + "a" + ); + + foreach($images as $image){ + + $this->fuckhtml->load($image); + + $image_data = + $this->unshiturl( + $image["attributes"]["href"], + true + ); + + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + )[0]; + + $out["image"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $img["attributes"]["alt"] + ) + ), + "source" => [ + [ + "url" => $image_data["url"], + "width" => $image_data["image_width"], + "height" => $image_data["image_height"] + ], + [ + "url" => + $this->fuckhtml + ->getTextContent( + $img["attributes"]["src"] + ), + "width" => $image_data["thumb_width"], + "height" => $image_data["thumb_height"] + ] + ], + "url" => $image_data["ref"] + ]; + } + + continue; + } - $sublinks["Call"] = - "tel:" . - $this->fuckhtml - ->getTextContent( + if( + $container_title == "related searches" || + $container_title == "people also search for" + ){ + + $as = $this->fuckhtml - ->getElementsByAttributeName( - "aria-label", + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]), "span" - )[0] - ); - - $this->fuckhtml->load($rhs); - } - } - - if(count($description) === 0){ - - // still no description? abort - return $out; - } - - // get table elements - $table = []; - $table_elems = - $this->fuckhtml - ->getElementsByClassName( - $this->getstyle( - [ - "margin-top" => "7px" - ] - ), - "div" - ); - - foreach($table_elems as $elem){ - - $this->fuckhtml->load($elem); + ); + + foreach($as as $a){ + + $out["related"][] = + $this->fuckhtml + ->getTextContent( + $a + ); + } + continue; + } + } - $spans = + // probe for website link + $link = $this->fuckhtml - ->getElementsByTagName( - "span" + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2", + "font-size" => "18px", + "line-height" => "24px" + ]), + "a" ); - if(count($spans) === 0){ + if(count($link) !== 0){ - // ?? invalid - continue; - } - - $elem["innerHTML"] = - str_replace( - $spans[0]["outerHTML"], - "", - $elem["innerHTML"] - ); - - $key = - rtrim( + // + // Parse search result + // + + $this->fuckhtml->load($link[0]); + + $title = $this->fuckhtml - ->getTextContent( - $spans[0] - ), - ": " - ); - - if( - $key == "" || - $key == "Phone" - ){ + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2", + "font-size" => "18px", + "line-height" => "24px" + ]), + "span" + ); - continue; - } - - if($key == "Hours"){ + if(count($title) === 0){ + + continue; + } - $hours = []; + $this->fuckhtml->load($box); - $this->fuckhtml->load($elem); + $sublinks = []; + $table = []; - $trs = + $categories = $this->fuckhtml - ->getElementsByTagName( - "tr" + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]), + "span" ); - foreach($trs as $tr){ + $i = 0; + foreach($categories as $category){ - $this->fuckhtml->load($tr); + $this->fuckhtml->load($category); - $tds = + // probe for sublinks + $subs = $this->fuckhtml - ->getElementsByTagName( - "td" + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "a" ); - if(count($tds) === 2){ + if(count($subs) !== 0){ - $hours[] = - $this->fuckhtml - ->getTextContent( - $tds[0] - ) . ": " . - $this->fuckhtml - ->getTextContent( - $tds[1] - ); + foreach($subs as $sub){ + + $url = + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $sub["attributes"]["href"] + ) + ); + + if( + preg_match( + '/^https?:\/\//', + $url + ) + ){ + + $sublinks[] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $sub + ) + ), + "description" => null, + "url" => + $this->unshiturl( + $this->fuckhtml + ->getTextContent( + $sub["attributes"]["href"] + ) + ), + "date" => null + ]; + } + } + + unset($categories[$i]); } - } - - if(count($hours) !== 0){ - $hours = implode("\n", $hours); - $table["Hours"] = $hours; + $i++; } - continue; - } - - $table[$key] = - preg_replace( - '/ +/', - " ", - $this->fuckhtml - ->getTextContent( - $elem - ) - ); - } - - // reset - $this->fuckhtml->load($rhs); - - // get the website div - $as = - $this->fuckhtml - ->getElementsByAttributeValue( - "data-attrid", - "visit_official_site", - "a" - ); - - if(count($as) !== 0){ - - $sublinks["Website"] = - str_replace( - "http://", - "https://", + // get description & date + $date = null; + + $categories = array_values($categories); + + //print_r($categories); + + $c = count($categories) - 1; + + $description = $this->fuckhtml ->getTextContent( - $as[0] - ["attributes"] - ["href"] - ) - ); - }else{ - - // get website through button - $button = - $this->fuckhtml - ->getElementsByClassName( - "ab_button", - "a" - ); - - if(count($button) !== 0){ - - $sublinks["Website"] = - $this->unshiturl( - $this->fuckhtml - ->getTextContent( - $button[0] - ["attributes"] - ["href"] - ) + $categories[$c] ); - } - } - - // get social media links - $as = - $this->fuckhtml - ->getElementsByTagName( - "g-link" - ); - - foreach($as as $a){ - - $this->fuckhtml->load($a); - - $link = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($link) === 0){ - continue; - } - - $sublink_title = - $this->fuckhtml - ->getTextContent( - $a - ); - - if($sublink_title == "X (Twitter)"){ + // remove last category since we're done with it + unset($categories[$c]); - $sublink_title = "Twitter"; - } - - $sublinks[$sublink_title] = - $this->fuckhtml - ->getTextContent( - $link[0] - ["attributes"] - ["href"] - ); - } - - // reset - $this->fuckhtml->load($rhs); - - // get those round containers - $containers = - $this->fuckhtml - ->getElementsByClassName( - "tpa-ci" - ); - - foreach($containers as $container){ - - $this->fuckhtml->load($container); - - $as = - $this->fuckhtml - ->getElementsByTagName( - "a" - ); - - if(count($as) === 0){ + // probe for date + $description_tmp = explode("·", $description, 2); + $date_tmp = strtotime(trim($description_tmp[0])); - continue; - } - - $sublinks[ - $this->fuckhtml - ->getTextContent( - $as[0] - ) - ] = - $this->fuckhtml - ->getTextContent( - $as[0] - ["attributes"] - ["href"] - ); - } - - $out["answer"][] = [ - "title" => $title, - "description" => $description, - "url" => $url, - "thumb" => null, - "table" => $table, - "sublink" => $sublinks - ]; - - return $out; - } - - - private function scrape_dimg($html){ - - // get images loaded through javascript - $this->dimg = []; - - preg_match_all( - '/function\(\){google\.ldi=({.*?});/', - $html, - $dimg - ); - - if(isset($dimg[1])){ - - foreach($dimg[1] as $i){ + if( + count($description_tmp) === 2 && + strlen($description_tmp[0]) <= 20 && + $date_tmp !== false + ){ + + $description = + ltrim( + $this->titledots( + $description_tmp[1] + ) + ); + $date = $date_tmp; + }else{ + + $description = + $this->titledots( + $description + ); + } - $tmp = json_decode($i, true); - foreach($tmp as $key => $value){ + // remaining categories should all be greytext + if(count($categories) !== 0){ - $this->dimg[$key] = - $this->unshit_thumb( - $value + $texts = + explode( + "·", + preg_replace( + '/\s+/', + " ", + $this->fuckhtml + ->getTextContent( + $categories[0] + ) + ) ); + + foreach($texts as $text){ + + $text = trim($text); + + if( + preg_match( + '/^Rating ([0-9.]+)(?: \(([0-9,]+)\))?/', + $text, + $rating + ) + ){ + + $table["Rating"] = $rating[1]; + if(isset($rating[2])){ + + $table["Rating"] .= " (" . $rating[2] . " votes)"; + } + + continue; + } + + if(stripos($text, "stock") !== false){ + + $table["Stock"] = $text; + continue; + } + } } - } - } - - // get additional javascript base64 images - preg_match_all( - '/var s=\'(data:image\/[^\']+)\';var ii=\[((?:\'[^\']+\',?)+)\];/', - $html, - $dimg - ); - - if(isset($dimg[1])){ + + $out["web"][] = [ + "title" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $title[0] + ) + ), + "description" => $description, + "url" => + $this->unshiturl( + $link[0]["attributes"]["href"] + ), + "date" => $date, + "type" => "web", + "thumb" => [ + "url" => null, + "ratio" => null + ], + "sublink" => $sublinks, + "table" => $table + ]; + + continue; + } - for($i=0; $ifuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "18px", + "line-height" => "24px" + ]), + "span" + ); + + if(count($wiki_title) !== 0){ - $delims = explode(",", $dimg[2][$i]); - $string = + $wiki_title = $this->fuckhtml - ->parseJsString( - $dimg[1][$i] + ->getTextContent( + $wiki_title[0] ); - foreach($delims as $delim){ + if($wiki_title == "See results about"){ - $this->dimg[trim($delim, "'")] = $string; + // ignore + continue; } - } - } - } - - - private function scrape_imagearr($html){ - // get image links arrays - preg_match_all( - '/\[0,"([^"]+)",\["([^"]+)\",([0-9]+),([0-9]+)\],\["([^"]+)",([0-9]+),([0-9]+)\]/', - $html, - $image_arr - ); - - $this->image_arr = []; - if(isset($image_arr[1])){ - - for($i=0; $iimage_arr[$image_arr[1][$i]] = - [ - [ - "url" => + if($wiki_title == "Top stories"){ + + // + // Parse news + // + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); + + foreach($tds as $td){ + + $this->fuckhtml->load($td); + + $a = + $this->fuckhtml + ->getElementsByTagName( + "a" + ); + + if(count($a) === 0){ + + continue; + } + + $title = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "span" + ); + + if(count($title) === 0){ + + continue; + } + + $date = null; + + $meta_div = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#70757a", + "font-size" => "13px", + "line-height" => "20px" + ]), + "span" + ); + + $meta_div = + explode( + "·", $this->fuckhtml - ->parseJsString( - $image_arr[5][$i] + ->getTextContent( + $meta_div[count($meta_div) - 1] ), - "width" => (int)$image_arr[7][$i], - "height" => (int)$image_arr[6][$i] - ], - [ - "url" => - $this->unshit_thumb( + 2 + ); + + if(count($meta_div) === 2){ + + $date = strtotime($meta_div[count($meta_div) - 1]); + + if($date === false){ + + $date = null; + } + } + + $out["news"][] = [ + "title" => + $this->titledots( $this->fuckhtml - ->parseJsString( - $image_arr[2][$i] + ->getTextContent( + $title[0] ) ), - "width" => (int)$image_arr[4][$i], - "height" => (int)$image_arr[3][$i] - ] - ]; - } - } - } - - - private function getdimg($dimg){ - - return isset($this->dimg[$dimg]) ? $this->dimg[$dimg] : null; - } - - - private function unshit_thumb($url){ - // https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQINE2vbnNLHXqoZr3RVsaEJFyOsj1_BiBnJch-e1nyz3oia7Aj5xVj - // https://i.ytimg.com/vi/PZVIyA5ER3Y/mqdefault.jpg?sqp=-oaymwEFCJQBEFM&rs=AMzJL3nXeaCpdIar-ltNwl82Y82cIJfphA - - $parts = parse_url($url); - - if( - isset($parts["host"]) && - preg_match( - '/tbn.*\.gstatic\.com/', - $parts["host"] - ) - ){ - - parse_str($parts["query"], $params); - - if(isset($params["q"])){ + "description" => null, + "date" => $date, + "thumb" => [ + "url" => null, + "ratio" => null + ], + "url" => + $this->unshiturl( + $a[0]["attributes"]["href"] + ) + ]; + } + continue; + } - return "https://" . $parts["host"] . "/images?q=" . $params["q"]; - } - } - - return $url; - } - - - private function parsestyles(){ - - $styles = []; - - $style_div = - $this->fuckhtml - ->getElementsByTagName( - "style" - ); - - $raw_styles = ""; - - foreach($style_div as $style){ - - $raw_styles .= $style["innerHTML"]; - } - - // filter out media/keyframe queries - $raw_styles = - preg_replace( - '/@\s*(?!font-face)[^{]+\s*{[\S\s]+?}\s*}/', - "", - $raw_styles - ); - - // get styles - preg_match_all( - '/(.+?){([\S\s]*?)}/', - $raw_styles, - $matches - ); - - for($i=0; $ifuckhtml + ->getElementsByTagName( + "table" + ); - $name = trim($name, "}\t\n\r\0\x0B"); + if(count($table_div) === 0){ + + continue; + } - foreach($values as $key => $value){ + $this->fuckhtml->load($table_div[0]); + + // remove table from box + $box["innerHTML"] = + str_replace( + $table_div[0]["outerHTML"], + "", + $box["innerHTML"] + ); + + // find wiki image + $thumb = null; + + $img = + $this->fuckhtml + ->getElementsByTagName( + "img" + ); + + if(count($img) !== 0){ - $styles[$name][$key] = $value; + $thumb = + $this->fuckhtml + ->getTextContent( + $img[0]["attributes"]["src"] + ); } - } - } - - foreach($styles as $key => $values){ - - $styles[$key]["_c"] = count($values); - } - - $this->styles = $styles; - - // get CSS colors - $this->css_colors = []; - - if(isset($this->styles[":root"])){ - - foreach($this->styles[":root"] as $key => $value){ - $this->css_colors[$value] = strtolower($key); - } - } - } - - - - private function getstyle($styles){ - - $styles["_c"] = count($styles); - - foreach($this->styles as $style_key => $style_values){ - - if(count(array_intersect_assoc($style_values, $styles)) === $styles["_c"] + 1){ + $tds = + $this->fuckhtml + ->getElementsByTagName( + "td" + ); - $style_key = - explode(" ", $style_key); + $description = []; - $style_key = $style_key[count($style_key) - 1]; + foreach($tds as $td){ + + // probe for subtitle + $this->fuckhtml->load($td); + + $subtext = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#70757a", + "font-size" => "13px", + "line-height" => "20px" + ]) + ); + + if(count($subtext) !== 0){ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $subtext[0] + ) + ]; + break; + } + } - return - ltrim( - str_replace( - [".", "#"], - " ", - $style_key - ) + $this->fuckhtml->load($box); + + // probe for word definition + $lists = + $this->fuckhtml + ->getElementsByTagName( + "ol" ); - } - } - - return false; - } - - - - private function getcolorvar($color){ - - if(isset($this->css_colors[$color])){ - - return $this->css_colors[$color]; - } - - return null; - } - - - - public function web($get){ - - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "web"); - $params = json_decode($params, true); - - $search = $params["q"]; - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - $params = [ - "q" => $search, - "hl" => "en", - "num" => 20 // get 20 results - ]; - - // country - if($country != "any"){ - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ + if(count($lists) !== 0){ + + $description = []; + + foreach($lists as $list){ + + $box["innerHTML"] = + explode( + $list["outerHTML"], + $box["innerHTML"], + 2 + ); + + if( + count($box["innerHTML"]) === 1 || + trim($box["innerHTML"][0]) == "" + ){ + + break; + } + + $description[] = [ + "type" => "title", + "value" => + $this->fuckhtml + ->getTextContent( + $box["innerHTML"][0] + ) + ]; + + $this->fuckhtml->load($list); + + $lis = + $this->fuckhtml + ->getElementsByTagName( + "li" + ); + + $increment = 1; + + foreach($lis as $li){ + + $this->fuckhtml->load($li); + + $list_items = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#202124", + "font-size" => "13px", + "line-height" => "20px" + ]) + ); + + $first_item = true; + foreach($list_items as $it){ + + if($first_item){ + + $first_item = false; + $c = count($description); + + if( + $c !== 0 && + $description[$c - 1]["type"] == "text" + ){ + + $description[$c - 1]["value"] .= + "\n\n" . + $increment . ". " . $this->fuckhtml + ->getTextContent( + $it + ); + }else{ + + $description[] = [ + "type" => "text", + "value" => + $increment . ". " . $this->fuckhtml + ->getTextContent( + $it + ) + ]; + } + }else{ + + $description[] = [ + "type" => "quote", + "value" => + $this->fuckhtml + ->getTextContent( + $it + ) + ]; + } + + $increment++; + } + } + + $box["innerHTML"] = $box["innerHTML"][1]; + } + + $out["answer"][] = [ + "title" => $wiki_title, + "description" => $description, + "url" => null, + "thumb" => null, + "table" => [], + "sublink" => [] + ]; + + continue; + } - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ + // get separator between description and facts + $separator = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "height" => "4px" + ]), + "div" + ); - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ + $box_html = []; + $table = []; - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ + if(count($separator) !== 0){ + + $box_html = + explode( + $separator[0]["outerHTML"], + $box["innerHTML"], + 2 + ); + + if(count($box_html) === 2){ + + $box["innerHTML"] = $box_html[0]; + } + + $this->fuckhtml->load($box_html[1]); + + // get all facts + $facts = + $this->fuckhtml + ->getElementsByTagName( + "div" + ); + + foreach($facts as $fact){ + + if($fact["level"] !== 1){ continue; } + + $fact = + explode( + ":", + $this->fuckhtml + ->getTextContent( + $fact + ) + ); + + $table[trim(preg_replace('/\s+/', " ", $fact[0]))] = + trim(preg_replace('/\s+/', " ", $fact[1])); + } + + $this->fuckhtml->load($box); + } - $params["tbs"] = ""; + // remove wikipedia link + $wiki_link = + $this->fuckhtml + ->getElementsByClassName( + $this->getstyle([ + "color" => "#1967d2" + ]), + "a" + ); - foreach($tbs as $key => $value){ + $url = null; + if(count($wiki_link) !== 0){ - $params["tbs"] .= $key . ":" . $value . ","; + foreach($wiki_link as $link){ + + if( + strtolower( + $this->fuckhtml + ->getTextContent( + $link + ) + ) == "wikipedia" + ){ + + $box["innerHTML"] = + str_replace( + $link["outerHTML"], + "", + $box["innerHTML"] + ); + + $url = + $this->unshiturl( + $link["attributes"]["href"] + ); + + $this->fuckhtml->load($box); + break; + } + } } - $params["tbs"] = rtrim($params["tbs"], ","); + // remains of box should be description + $description[] = [ + "type" => "text", + "value" => + $this->titledots( + $this->fuckhtml + ->getTextContent( + $box + ) + ) + ]; + + $out["answer"][] = [ + "title" => $wiki_title, + "description" => $description, + "url" => $url, + "thumb" => $thumb, + "table" => $table, + "sublink" => [] + ]; } } - try{ - $html = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$html = file_get_contents("scraper/google.txt"); - - return $this->parsepage($html, "web", $search, $proxy, $params); + return $out; } @@ -4835,19 +3226,28 @@ class google{ // decode $url = $this->fuckhtml - ->getTextContent($url); + ->getTextContent( + $url + ); $url_parts = parse_url($url); + if(isset($url_parts["query"])){ + + parse_str($url_parts["query"], $query); + }else{ + + $query = []; + } + if( !isset( $url_parts["host"] - ) + ) || + stripos($url_parts["host"], "google.") !== false ){ // no host, we have a tracking url - parse_str($url_parts["query"], $query); - if(isset($query["imgurl"])){ $url = $query["imgurl"]; @@ -4908,20 +3308,20 @@ class google{ ){ // remove referrers from play.google.com - $oldquery = parse_url($url, PHP_URL_QUERY); - if($oldquery !== null){ + $u_query = parse_url($url, PHP_URL_QUERY); + if($u_query !== null){ - parse_str($oldquery, $query); - if(isset($query["referrer"])){ unset($query["referrer"]); } - if(isset($query["hl"])){ unset($query["hl"]); } - if(isset($query["gl"])){ unset($query["gl"]); } + parse_str($u_query, $u_query); + if(isset($u_query["referrer"])){ unset($u_query["referrer"]); } + if(isset($u_query["hl"])){ unset($u_query["hl"]); } + if(isset($u_query["gl"])){ unset($u_query["gl"]); } $query = http_build_query($query); $url = str_replace( - $oldquery, - $query, + $u_query, + $u_query, $url ); } @@ -4934,18 +3334,18 @@ class google{ ) ){ // remove more referrers from twitter.com - $oldquery = parse_url($url, PHP_URL_QUERY); - if($oldquery !== null){ + $u_query = parse_url($url, PHP_URL_QUERY); + if($u_query !== null){ - parse_str($oldquery, $query); - if(isset($query["ref_src"])){ unset($query["ref_src"]); } + parse_str($u_query, $u_query); + if(isset($u_query["ref_src"])){ unset($u_query["ref_src"]); } - $query = http_build_query($query); + $u_query = http_build_query($u_query); $url = str_replace( $oldquery, - $query, + $u_query, $url ); } @@ -4960,18 +3360,17 @@ class google{ if(stripos($url, "maps?") !== false){ - //https://maps.google.com/maps?daddr=Johnny,+603+Rue+St+Georges,+Saint-J%C3%A9r%C3%B4me,+Quebec+J7Z+5B7 - $query = parse_url($url, PHP_URL_QUERY); + $u_query = parse_url($url, PHP_URL_QUERY); - if($query !== null){ + if($u_query !== null){ - parse_str($query, $query); + parse_str($u_query, $u_query); - if(isset($query["daddr"])){ + if(isset($u_query["daddr"])){ $url = "https://maps.google.com/maps?daddr=" . - urlencode($query["daddr"]); + urlencode($u_query["daddr"]); } } } -- cgit v1.2.3