about | summary | refs | log | tree | commit | diff | stats
path: root/scraper/google.php
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2025-10-08 00:42:36 -0400
committer lolcat <will@lolcat.ca> 2025-10-08 00:42:36 -0400
commit a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 (patch)
tree 1105658aaafdaec94205a3d1c032ba967fd5ac20 /scraper/google.php
parent 4b16fd58971321e6938046702210e2773573c621 (diff)
google quote on quote fix
Diffstat (limited to 'scraper/google.php')
-rw-r--r-- scraper/google.php 461
1 files changed, 1 insertions, 460 deletions
diff --git a/scraper/google.php b/scraper/google.php
index c83b084..049c844 100644
--- a/scraper/google.php
+++ b/scraper/google.php
@@ -561,466 +561,7 @@ class google{
public function web($get){
- if($get["npt"]){
-
- [$params, $proxy] = $this->backend->get($get["npt"], "web");
-
- $params = json_decode($params, true);
-
- try{
- $json =
- $this->get(
- $proxy,
- "https://www.google.com/search",
- $params
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to get HTML");
- }
-
- }else{
- $search = $get["s"];
- $country = $get["country"];
- $nsfw = $get["nsfw"];
- $lang = $get["lang"];
- $older = $get["older"];
- $newer = $get["newer"];
- $spellcheck = $get["spellcheck"];
- $proxy = $this->backend->get_ip();
-
- $offset = 0;
-
- /*
- https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
-
- https://www.google.com/search?udm=14&
- yv=3&
- q=asmr&
- biw=1920&
- bih=947&
- start=0&
- sa=N&
- asearch=arc&
- cs=1&
- async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc
- */
-
- $params = [
- "udm" => 14,
- "yv" => 3,
- "q" => $search,
- "biw" => 1920,
- "bih" => 947,
- "start" => 0,
- "sa" => "N",
- "asearch" => "arc",
- "cs" => 1,
- "async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc",
- "hl" => "en",
- "num" => 20
- ];
-
- // country
- if($country != "any"){
-
- $params["gl"] = $country;
- }
-
- // nsfw
- $params["safe"] = $nsfw == "yes" ? "off" : "active";
-
- // language
- if($lang != "any"){
-
- $params["lr"] = "lang_" . $lang;
- }
-
- // generate tbs
- $tbs = [];
-
- // get date
- $older = $older === false ? null : date("m/d/Y", $older);
- $newer = $newer === false ? null : date("m/d/Y", $newer);
-
- if(
- $older !== null ||
- $newer !== null
- ){
-
- $tbs["cdr"] = "1";
- $tbs["cd_min"] = $newer;
- $tbs["cd_max"] = $older;
- }
-
- // spellcheck filter
- if($spellcheck == "no"){
-
- $params["nfpr"] = "1";
- }
-
- if(count($tbs) !== 0){
-
- $params["tbs"] = "";
-
- foreach($tbs as $key => $value){
-
- $params["tbs"] .= $key . ":" . $value . ",";
- }
-
- $params["tbs"] = rtrim($params["tbs"], ",");
- }
-
- try{
- $json =
- $this->get(
- $proxy,
- "https://www.google.com/search",
- $params
- );
- }catch(Exception $error){
-
- throw new Exception("Failed to get HTML");
- }
-
- //$json = file_get_contents("scraper/google.js");
- }
-
- $out = [
- "status" => "ok",
- "spelling" => [
- "type" => "no_correction",
- "using" => null,
- "correction" => null
- ],
- "npt" => null,
- "answer" => [],
- "web" => [],
- "image" => [],
- "video" => [],
- "news" => [],
- "related" => []
- ];
-
- $this->fuckhtml->load($json);
- $this->detect_sorry();
-
- // get next page
- /*
- $npt =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-state-token",
- "div"
- );
-
- if(count($npt) !== 0){
-
- $params["sstk"] =
- $this->fuckhtml
- ->getTextContent(
- $npt[0]["attributes"]["data-state-token"]
- );
-
- $params["start"] += 10;
-
- $out["npt"] =
- $this->backend->store(
- json_encode($params),
- "web",
- $proxy
- );
- }*/
-
- // get invididual results
- $results =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-hveid",
- "div"
- );
-
- foreach($results as $result){
-
- $this->fuckhtml->load($result);
-
- //echo $result["innerHTML"];
-
- $snfs =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-snf",
- "div"
- );
-
- $title = null;
- $description = null;
- $link = null;
- $sublinks = [];
- $date = null;
- $thumb = [
- "ratio" => null,
- "url" => null
- ];
- $table = [];
-
- // probe for title
- $title_node =
- $this->fuckhtml
- ->getElementsByTagName(
- "h3"
- );
-
- if(count($title_node) !== 0){
-
- // found a title node
- $title =
- $this->fuckhtml
- ->getTextContent(
- $title_node[0]
- );
- }
-
- if($title === null){
-
- // should not happen
- continue;
- }
-
- foreach($snfs as $snf){
-
- $this->fuckhtml->load($snf);
-
- // probe for thumbnail
- $thumbnail =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "alt",
- "img"
- );
-
- foreach($thumbnail as $t){
-
- if(
- isset($t["attributes"]["style"]) &&
- preg_match(
- '/height ?: ?([0-9]+)px/',
- $t["attributes"]["style"],
- $match
- ) &&
- (int)$match[1] < 40
- ){
-
- // found a favicon, ignore
- continue;
- }
-
- $thumb = [
- "ratio" => "1:1",
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $thumbnail[0]["attributes"]["src"]
- )
- ];
-
- continue 2;
- }
-
- // probe for description
- if($description === null){
-
- // probe 1
- if(
- isset($snf["attributes"]["data-sncf"]) &&
- $snf["attributes"]["data-sncf"] == "1,2"
- ){
-
- $description =
- $this->fuckhtml
- ->getTextContent(
- $snf
- );
- continue;
- }
-
- // probe 2
- $desc_probe =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "style",
- "-webkit-line-clamp:2",
- "div"
- );
-
- if(count($desc_probe) !== 0){
-
- $description =
- $this->fuckhtml
- ->getTextContent(
- $desc_probe[0]
- );
- continue;
- }
- }
-
- // probe for links
- $links =
- $this->fuckhtml
- ->getElementsByAttributeName(
- "data-sb",
- "a"
- );
-
- if(isset($links[0]["attributes"]["data-ved"])){
-
- // found the page link
- $link =
- $this->fuckhtml
- ->getTextContent(
- $links[0]["attributes"]["href"]
- );
-
- continue;
- }
-
- if(count($links) !== 0){
-
- // get all sublinks
- for($i=0; $i<count($links); $i++){
-
- $sublinks[] = [
- "title" =>
- $this->titledots(
- $this->fuckhtml
- ->getTextContent(
- $links[$i]
- )
- ),
- "description" => null,
- "date" => null,
- "url" =>
- $this->fuckhtml
- ->getTextContent(
- $links[$i]["attributes"]["href"]
- )
- ];
- }
-
- continue;
- }
-
- // get tabloid-able data
- $tabloid =
- $this->fuckhtml
- ->getElementsByAttributeValue(
- "style",
- "margin-top:0px",
- "div"
- );
-
- if(count($tabloid) === 0){
-
- // try getting <cite> instead
- $tabloid =
- $this->fuckhtml
- ->getElementsByTagName(
- "cite"
- );
- }
-
- if(count($tabloid) !== 0){
-
- // found table
- $tabloid =
- explode("·", $tabloid[0]["innerHTML"]);
-
- foreach($tabloid as $tbl){
-
- $preg =
- $this->fuckhtml
- ->getTextContent(
- $tbl
- );
-
- //$table[random_int(0,1000)] = $preg;
-
- if(
- // match price
- preg_match(
- '/(\p{Sc}[^\p{Sc}]+)/',
- $preg,
- $match
- )
- ){
-
- $table["Price"] = trim($match[1]);
- }
-
- if(
- // match in stock/delivery
- preg_match(
- '/(stock|delivery|returns)/i',
- $preg,
- $match
- )
- ){
-
- $table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0");
- }
- }
- continue;
- }
- }
-
- // extract date from description
- $description_split =
- explode(
- "—", $description, 2
- );
-
- if(count($description_split) === 1){
-
- $description = $description_split[0];
- }elseif(strlen($description_split[0]) < 17){
-
- $date = strtotime($description_split[0]);
-
- if($date !== false){
-
- $description = $description_split[1];
- }else{
-
- $date = null;
- }
- }
-
- $out["web"][] = [
- "title" => $this->titledots($title),
- "description" => $this->titledots($description),
- "url" => $link,
- "date" => $date,
- "type" => "web",
- "thumb" => $thumb,
- "sublink" => $sublinks,
- "table" => $table
- ];
- }
-
- // get next page
- if(count($out["web"]) > 5){
-
- $params["start"] += 10;
-
- $out["npt"] =
- $this->backend->store(
- json_encode($params),
- "web",
- $proxy
- );
- }
-
- return $out;
+ throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers.");
}
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage