diff options
| author | lolcat <will@lolcat.ca> | 2025-10-08 00:42:36 -0400 |
|---|---|---|
| committer | lolcat <will@lolcat.ca> | 2025-10-08 00:42:36 -0400 |
| commit | a4a44709b4ee1dffaca8b7f79b3c0814914a58f7 (patch) | |
| tree | 1105658aaafdaec94205a3d1c032ba967fd5ac20 | |
| parent | 4b16fd58971321e6938046702210e2773573c621 (diff) | |
google quote on quote fix
| -rw-r--r-- | data/api_keys/google_api.txt | 8 | ||||
| -rw-r--r-- | lib/backend.php | 37 | ||||
| -rw-r--r-- | lib/frontend.php | 3 | ||||
| -rw-r--r-- | scraper/google.php | 461 | ||||
| -rw-r--r-- | scraper/google_api.php | 739 | ||||
| -rw-r--r-- | settings.php | 4 |
6 files changed, 789 insertions, 463 deletions
diff --git a/data/api_keys/google_api.txt b/data/api_keys/google_api.txt new file mode 100644 index 0000000..5548d21 --- /dev/null +++ b/data/api_keys/google_api.txt @@ -0,0 +1,8 @@ +# Specify API keys for the Google API in the following format: +# <key> +# +# Generate keys here: +# https://developers.google.com/custom-search/v1/overview +# Make sure to use a different Google account for each key, cause I'm +# pretty sure the ratelimit is on a per-account basis :P +# diff --git a/lib/backend.php b/lib/backend.php index 66e78a1..68ac270 100644 --- a/lib/backend.php +++ b/lib/backend.php @@ -9,7 +9,7 @@ class backend{ /* Proxy stuff */ - public function get_ip(){ + public function get_ip($proxy_index_raw = null){ $pool = constant("config::PROXY_" . strtoupper($this->scraper)); if($pool === false){ @@ -19,7 +19,10 @@ class backend{ } // indent - $proxy_index_raw = apcu_inc("p." . $this->scraper); + if($proxy_index_raw === null){ + + $proxy_index_raw = apcu_inc("p." . $this->scraper); + } $proxylist = file_get_contents("data/proxies/" . $pool . ".txt"); $proxylist = explode("\n", $proxylist); @@ -32,6 +35,12 @@ class backend{ $proxylist = array_values($proxylist); + if(count($proxylist) === 0){ + + throw new Exception("A proxy list was specified but it's empty!"); + } + + //echo $proxylist[$proxy_index_raw % count($proxylist)]; return $proxylist[$proxy_index_raw % count($proxylist)]; } @@ -88,6 +97,30 @@ class backend{ } } + // API key rotation + public function get_key(){ + + $keys = file_get_contents("data/api_keys/" . $this->scraper . ".txt"); + $keys = explode("\n", $keys); + + $keys = array_filter($keys, function($entry){ + $entry = ltrim($entry); + return strlen($entry) > 0 && substr($entry, 0, 1) != "#"; + }); + + $keys = array_values($keys); + + if(count($keys) === 0){ + + throw new Exception("Please specify API keys in data/api_keys/" . $this->scraper . ".txt"); + } + + $increment = apcu_inc("s." . $this->scraper) % count($keys); + return [ + "key" => $keys[$increment], + "increment" => $increment + ]; + } /* diff --git a/lib/frontend.php b/lib/frontend.php index dfa8b0b..eb87df7 100644 --- a/lib/frontend.php +++ b/lib/frontend.php @@ -937,11 +937,12 @@ class frontend{ "display" => "Scraper", "option" => [ "ddg" => "DuckDuckGo", + "yahoo" => "Yahoo!", "brave" => "Brave", "mullvad_brave" => "Mullvad (Brave)", "yandex" => "Yandex", "google" => "Google", - //"google_api" => "Google API", + "google_api" => "Google API", "google_cse" => "Google CSE", "mullvad_google" => "Mullvad (Google)", "startpage" => "Startpage", diff --git a/scraper/google.php b/scraper/google.php index c83b084..049c844 100644 --- a/scraper/google.php +++ b/scraper/google.php @@ -561,466 +561,7 @@ class google{ public function web($get){ - if($get["npt"]){ - - [$params, $proxy] = $this->backend->get($get["npt"], "web"); - - $params = json_decode($params, true); - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - }else{ - $search = $get["s"]; - $country = $get["country"]; - $nsfw = $get["nsfw"]; - $lang = $get["lang"]; - $older = $get["older"]; - $newer = $get["newer"]; - $spellcheck = $get["spellcheck"]; - $proxy = $this->backend->get_ip(); - - $offset = 0; - - /* - https://www.google.com/search?udm=14&yv=3&q=asmr&biw=1920&bih=947&start=0&sa=N&asearch=arc&cs=1&async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - - https://www.google.com/search?udm=14& - yv=3& - q=asmr& - biw=1920& - bih=947& - start=0& - sa=N& - asearch=arc& - cs=1& - async=arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc - */ - - $params = [ - "udm" => 14, - "yv" => 3, - "q" => $search, - "biw" => 1920, - "bih" => 947, - "start" => 0, - "sa" => "N", - "asearch" => "arc", - "cs" => 1, - "async" => "arc_id:srp_s_110,ffilt:all,ve_name:MoreResultsContainer,use_ac:false,inf:1,_id:arc-srp_s_110,_pms:s,_fmt:pc", - "hl" => "en", - "num" => 20 - ]; - - // country - if($country != "any"){ - - $params["gl"] = $country; - } - - // nsfw - $params["safe"] = $nsfw == "yes" ? "off" : "active"; - - // language - if($lang != "any"){ - - $params["lr"] = "lang_" . $lang; - } - - // generate tbs - $tbs = []; - - // get date - $older = $older === false ? null : date("m/d/Y", $older); - $newer = $newer === false ? null : date("m/d/Y", $newer); - - if( - $older !== null || - $newer !== null - ){ - - $tbs["cdr"] = "1"; - $tbs["cd_min"] = $newer; - $tbs["cd_max"] = $older; - } - - // spellcheck filter - if($spellcheck == "no"){ - - $params["nfpr"] = "1"; - } - - if(count($tbs) !== 0){ - - $params["tbs"] = ""; - - foreach($tbs as $key => $value){ - - $params["tbs"] .= $key . ":" . $value . ","; - } - - $params["tbs"] = rtrim($params["tbs"], ","); - } - - try{ - $json = - $this->get( - $proxy, - "https://www.google.com/search", - $params - ); - }catch(Exception $error){ - - throw new Exception("Failed to get HTML"); - } - - //$json = file_get_contents("scraper/google.js"); - } - - $out = [ - "status" => "ok", - "spelling" => [ - "type" => "no_correction", - "using" => null, - "correction" => null - ], - "npt" => null, - "answer" => [], - "web" => [], - "image" => [], - "video" => [], - "news" => [], - "related" => [] - ]; - - $this->fuckhtml->load($json); - $this->detect_sorry(); - - // get next page - /* - $npt = - $this->fuckhtml - ->getElementsByAttributeName( - "data-state-token", - "div" - ); - - if(count($npt) !== 0){ - - $params["sstk"] = - $this->fuckhtml - ->getTextContent( - $npt[0]["attributes"]["data-state-token"] - ); - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - }*/ - - // get invididual results - $results = - $this->fuckhtml - ->getElementsByAttributeName( - "data-hveid", - "div" - ); - - foreach($results as $result){ - - $this->fuckhtml->load($result); - - //echo $result["innerHTML"]; - - $snfs = - $this->fuckhtml - ->getElementsByAttributeName( - "data-snf", - "div" - ); - - $title = null; - $description = null; - $link = null; - $sublinks = []; - $date = null; - $thumb = [ - "ratio" => null, - "url" => null - ]; - $table = []; - - // probe for title - $title_node = - $this->fuckhtml - ->getElementsByTagName( - "h3" - ); - - if(count($title_node) !== 0){ - - // found a title node - $title = - $this->fuckhtml - ->getTextContent( - $title_node[0] - ); - } - - if($title === null){ - - // should not happen - continue; - } - - foreach($snfs as $snf){ - - $this->fuckhtml->load($snf); - - // probe for thumbnail - $thumbnail = - $this->fuckhtml - ->getElementsByAttributeName( - "alt", - "img" - ); - - foreach($thumbnail as $t){ - - if( - isset($t["attributes"]["style"]) && - preg_match( - '/height ?: ?([0-9]+)px/', - $t["attributes"]["style"], - $match - ) && - (int)$match[1] < 40 - ){ - - // found a favicon, ignore - continue; - } - - $thumb = [ - "ratio" => "1:1", - "url" => - $this->fuckhtml - ->getTextContent( - $thumbnail[0]["attributes"]["src"] - ) - ]; - - continue 2; - } - - // probe for description - if($description === null){ - - // probe 1 - if( - isset($snf["attributes"]["data-sncf"]) && - $snf["attributes"]["data-sncf"] == "1,2" - ){ - - $description = - $this->fuckhtml - ->getTextContent( - $snf - ); - continue; - } - - // probe 2 - $desc_probe = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "-webkit-line-clamp:2", - "div" - ); - - if(count($desc_probe) !== 0){ - - $description = - $this->fuckhtml - ->getTextContent( - $desc_probe[0] - ); - continue; - } - } - - // probe for links - $links = - $this->fuckhtml - ->getElementsByAttributeName( - "data-sb", - "a" - ); - - if(isset($links[0]["attributes"]["data-ved"])){ - - // found the page link - $link = - $this->fuckhtml - ->getTextContent( - $links[0]["attributes"]["href"] - ); - - continue; - } - - if(count($links) !== 0){ - - // get all sublinks - for($i=0; $i<count($links); $i++){ - - $sublinks[] = [ - "title" => - $this->titledots( - $this->fuckhtml - ->getTextContent( - $links[$i] - ) - ), - "description" => null, - "date" => null, - "url" => - $this->fuckhtml - ->getTextContent( - $links[$i]["attributes"]["href"] - ) - ]; - } - - continue; - } - - // get tabloid-able data - $tabloid = - $this->fuckhtml - ->getElementsByAttributeValue( - "style", - "margin-top:0px", - "div" - ); - - if(count($tabloid) === 0){ - - // try getting <cite> instead - $tabloid = - $this->fuckhtml - ->getElementsByTagName( - "cite" - ); - } - - if(count($tabloid) !== 0){ - - // found table - $tabloid = - explode("·", $tabloid[0]["innerHTML"]); - - foreach($tabloid as $tbl){ - - $preg = - $this->fuckhtml - ->getTextContent( - $tbl - ); - - //$table[random_int(0,1000)] = $preg; - - if( - // match price - preg_match( - '/(\p{Sc}[^\p{Sc}]+)/', - $preg, - $match - ) - ){ - - $table["Price"] = trim($match[1]); - } - - if( - // match in stock/delivery - preg_match( - '/(stock|delivery|returns)/i', - $preg, - $match - ) - ){ - - $table[ucfirst($match[1])] = trim($preg, " \t\n\r\0\x0B\xC2\xA0"); - } - } - continue; - } - } - - // extract date from description - $description_split = - explode( - "—", $description, 2 - ); - - if(count($description_split) === 1){ - - $description = $description_split[0]; - }elseif(strlen($description_split[0]) < 17){ - - $date = strtotime($description_split[0]); - - if($date !== false){ - - $description = $description_split[1]; - }else{ - - $date = null; - } - } - - $out["web"][] = [ - "title" => $this->titledots($title), - "description" => $this->titledots($description), - "url" => $link, - "date" => $date, - "type" => "web", - "thumb" => $thumb, - "sublink" => $sublinks, - "table" => $table - ]; - } - - // get next page - if(count($out["web"]) > 5){ - - $params["start"] += 10; - - $out["npt"] = - $this->backend->store( - json_encode($params), - "web", - $proxy - ); - } - - return $out; + throw new Exception("Google made it impossible to scrape web results without a JavaScript runtime. In the meantime, use the Google API or the Google CSE scrapers."); } diff --git a/scraper/google_api.php b/scraper/google_api.php new file mode 100644 index 0000000..899726b --- /dev/null +++ b/scraper/google_api.php @@ -0,0 +1,739 @@ +<?php + +// @TODO check for consent.google.com page, if need be + +class google_api{ + + public function __construct(){ + + include "lib/backend.php"; + $this->backend = new backend("google_api"); + } + + public function getfilters($page){ + + $base = [ + "country" => [ // gl=<country> (image: cr=countryAF) + "display" => "Country", + "option" => [ + "any" => "Instance's country", + "af" => "Afghanistan", + "al" => "Albania", + "dz" => "Algeria", + "as" => "American Samoa", + "ad" => "Andorra", + "ao" => "Angola", + "ai" => "Anguilla", + "aq" => "Antarctica", + "ag" => "Antigua and Barbuda", + "ar" => "Argentina", + "am" => "Armenia", + "aw" => "Aruba", + "au" => "Australia", + "at" => "Austria", + "az" => "Azerbaijan", + "bs" => "Bahamas", + "bh" => "Bahrain", + "bd" => "Bangladesh", + "bb" => "Barbados", + "by" => "Belarus", + "be" => "Belgium", + "bz" => "Belize", + "bj" => "Benin", + "bm" => "Bermuda", + "bt" => "Bhutan", + "bo" => "Bolivia", + "ba" => "Bosnia and Herzegovina", + "bw" => "Botswana", + "bv" => "Bouvet Island", + "br" => "Brazil", + "io" => "British Indian Ocean Territory", + "bn" => "Brunei Darussalam", + "bg" => "Bulgaria", + "bf" => "Burkina Faso", + "bi" => "Burundi", + "kh" => "Cambodia", + "cm" => "Cameroon", + "ca" => "Canada", + "cv" => "Cape Verde", + "ky" => "Cayman Islands", + "cf" => "Central African Republic", + "td" => "Chad", + "cl" => "Chile", + "cn" => "China", + "cx" => "Christmas Island", + "cc" => "Cocos (Keeling) Islands", + "co" => "Colombia", + "km" => "Comoros", + "cg" => "Congo", + "cd" => "Congo, the Democratic Republic", + "ck" => "Cook Islands", + "cr" => "Costa Rica", + "ci" => "Cote D'ivoire", + "hr" => "Croatia", + "cu" => "Cuba", + "cy" => "Cyprus", + "cz" => "Czech Republic", + "dk" => "Denmark", + "dj" => "Djibouti", + "dm" => "Dominica", + "do" => "Dominican Republic", + "ec" => "Ecuador", + "eg" => "Egypt", + "sv" => "El Salvador", + "gq" => "Equatorial Guinea", + "er" => "Eritrea", + "ee" => "Estonia", + "et" => "Ethiopia", + "fk" => "Falkland Islands (Malvinas)", + "fo" => "Faroe Islands", + "fj" => "Fiji", + "fi" => "Finland", + "fr" => "France", + "gf" => "French Guiana", + "pf" => "French Polynesia", + "tf" => "French Southern Territories", + "ga" => "Gabon", + "gm" => "Gambia", + "ge" => "Georgia", + "de" => "Germany", + "gh" => "Ghana", + "gi" => "Gibraltar", + "gr" => "Greece", + "gl" => "Greenland", + "gd" => "Grenada", + "gp" => "Guadeloupe", + "gu" => "Guam", + "gt" => "Guatemala", + "gn" => "Guinea", + "gw" => "Guinea-Bissau", + "gy" => "Guyana", + "ht" => "Haiti", + "hm" => "Heard Island and Mcdonald Islands", + "va" => "Holy See (Vatican City State)", + "hn" => "Honduras", + "hk" => "Hong Kong", + "hu" => "Hungary", + "is" => "Iceland", + "in" => "India", + "id" => "Indonesia", + "ir" => "Iran, Islamic Republic", + "iq" => "Iraq", + "ie" => "Ireland", + "il" => "Israel", + "it" => "Italy", + "jm" => "Jamaica", + "jp" => "Japan", + "jo" => "Jordan", + "kz" => "Kazakhstan", + "ke" => "Kenya", + "ki" => "Kiribati", + "kp" => "Korea, Democratic People's Republic", + "kr" => "Korea, Republic", + "kw" => "Kuwait", + "kg" => "Kyrgyzstan", + "la" => "Lao People's Democratic Republic", + "lv" => "Latvia", + "lb" => "Lebanon", + "ls" => "Lesotho", + "lr" => "Liberia", + "ly" => "Libyan Arab Jamahiriya", + "li" => "Liechtenstein", + "lt" => "Lithuania", + "lu" => "Luxembourg", + "mo" => "Macao", + "mk" => "Macedonia, the Former Yugosalv Republic", + "mg" => "Madagascar", + "mw" => "Malawi", + "my" => "Malaysia", + "mv" => "Maldives", + "ml" => "Mali", + "mt" => "Malta", + "mh" => "Marshall Islands", + "mq" => "Martinique", + "mr" => "Mauritania", + "mu" => "Mauritius", + "yt" => "Mayotte", + "mx" => "Mexico", + "fm" => "Micronesia, Federated States", + "md" => "Moldova, Republic", + "mc" => "Monaco", + "mn" => "Mongolia", + "ms" => "Montserrat", + "ma" => "Morocco", + "mz" => "Mozambique", + "mm" => "Myanmar", + "na" => "Namibia", + "nr" => "Nauru", + "np" => "Nepal", + "nl" => "Netherlands", + "an" => "Netherlands Antilles", + "nc" => "New Caledonia", + "nz" => "New Zealand", + "ni" => "Nicaragua", + "ne" => "Niger", + "ng" => "Nigeria", + "nu" => "Niue", + "nf" => "Norfolk Island", + "mp" => "Northern Mariana Islands", + "no" => "Norway", + "om" => "Oman", + "pk" => "Pakistan", + "pw" => "Palau", + "ps" => "Palestinian Territory, Occupied", + "pa" => "Panama", + "pg" => "Papua New Guinea", + "py" => "Paraguay", + "pe" => "Peru", + "ph" => "Philippines", + "pn" => "Pitcairn", + "pl" => "Poland", + "pt" => "Portugal", + "pr" => "Puerto Rico", + "qa" => "Qatar", + "re" => "Reunion", + "ro" => "Romania", + "ru" => "Russian Federation", + "rw" => "Rwanda", + "sh" => "Saint Helena", + "kn" => "Saint Kitts and Nevis", + "lc" => "Saint Lucia", + "pm" => "Saint Pierre and Miquelon", + "vc" => "Saint Vincent and the Grenadines", + "ws" => "Samoa", + "sm" => "San Marino", + "st" => "Sao Tome and Principe", + "sa" => "Saudi Arabia", + "sn" => "Senegal", + "cs" => "Serbia and Montenegro", + "sc" => "Seychelles", + "sl" => "Sierra Leone", + "sg" => "Singapore", + "sk" => "Slovakia", + "si" => "Slovenia", + "sb" => "Solomon Islands", + "so" => "Somalia", + "za" => "South Africa", + "gs" => "South Georgia and the South Sandwich Islands", + "es" => "Spain", + "lk" => "Sri Lanka", + "sd" => "Sudan", + "sr" => "Suriname", + "sj" => "Svalbard and Jan Mayen", + "sz" => "Swaziland", + "se" => "Sweden", + "ch" => "Switzerland", + "sy" => "Syrian Arab Republic", + "tw" => "Taiwan, Province of China", + "tj" => "Tajikistan", + "tz" => "Tanzania, United Republic", + "th" => "Thailand", + "tl" => "Timor-Leste", + "tg" => "Togo", + "tk" => "Tokelau", + "to" => "Tonga", + "tt" => "Trinidad and Tobago", + "tn" => "Tunisia", + "tr" => "Turkey", + "tm" => "Turkmenistan", + "tc" => "Turks and Caicos Islands", + "tv" => "Tuvalu", + "ug" => "Uganda", + "ua" => "Ukraine", + "ae" => "United Arab Emirates", + "uk" => "United Kingdom", + "us" => "United States", + "um" => "United States Minor Outlying Islands", + "uy" => "Uruguay", + "uz" => "Uzbekistan", + "vu" => "Vanuatu", + "ve" => "Venezuela", + "vn" => "Viet Nam", + "vg" => "Virgin Islands, British", + "vi" => "Virgin Islands, U.S.", + "wf" => "Wallis and Futuna", + "eh" => "Western Sahara", + "ye" => "Yemen", + "zm" => "Zambia", + "zw" => "Zimbabwe" + ] + ], + "nsfw" => [ + "display" => "NSFW", + "option" => [ + "yes" => "Yes", // safe=active + "no" => "No" // safe=off + ] + ] + ]; + + switch($page){ + + case "web": + return array_merge( + $base, + [ + "lang" => [ // lr=<lang> (prefix lang with "lang_") + "display" => "Language", + "option" => [ + "any" => "Any language", + "ar" => "Arabic", + "bg" => "Bulgarian", + "ca" => "Catalan", + "cs" => "Czech", + "da" => "Danish", + "de" => "German", + "el" => "Greek", + "en" => "English", + "es" => "Spanish", + "et" => "Estonian", + "fi" => "Finnish", + "fr" => "French", + "hr" => "Croatian", + "hu" => "Hungarian", + "id" => "Indonesian", + "is" => "Icelandic", + "it" => "Italian", + "iw" => "Hebrew", + "ja" => "Japanese", + "ko" => "Korean", + "lt" => "Lithuanian", + "lv" => "Latvian", + "nl" => "Dutch", + "no" => "Norwegian", + "pl" => "Polish", + "pt" => "Portuguese", + "ro" => "Romanian", + "ru" => "Russian", + "sk" => "Slovak", + "sl" => "Slovenian", + "sr" => "Serbian", + "sv" => "Swedish", + "tr" => "Turkish", + "zh-CN" => "Chinese (Simplified)", + "zh-TW" => "Chinese (Traditional)" + ] + ], + "sort" => [ + "display" => "Sort by", + "option" => [ + "any" => "Any order", + "date:d" => "Oldest", + "date:a" => "Newest" + ] + ], + "newer" => [ + "display" => "Newer than", + "option" => "_DATE" + ], + "rm_dupes" => [ + "display" => "Remove duplicates", + "option" => [ + "yes" => "Yes", + "no" => "No" + ] + ] + ] + ); + break; + /* + case "images": + return array_merge( + $base, + [ + "time" => [ // tbs=qdr:<time> + "display" => "Time posted", + "option" => [ + "any" => "Any time", + "d" => "Past 24 hours", + "w" => "Past week", + "m" => "Past month", + "y" => "Past year" + ] + ], + "size" => [ // imgsz + "display" => "Size", + "option" => [ + "any" => "Any size", + "l" => "Large", + "m" => "Medium", + "i" => "Icon", + "qsvga" => "Larger than 400x300", + "vga" => "Larger than 640x480", + "svga" => "Larger than 800x600", + "xga" => "Larger than 1024x768", + "2mp" => "Larger than 2MP", + "4mp" => "Larger than 4MP", + "6mp" => "Larger than 6MP", + "8mp" => "Larger than 8MP", + "10mp" => "Larger than 10MP", + "12mp" => "Larger than 12MP", + "15mp" => "Larger than 15MP", + "20mp" => "Larger than 20MP", + "40mp" => "Larger than 40MP", + "70mp" => "Larger than 70MP" + ] + ], + "ratio" => [ // imgar + "display" => "Aspect ratio", + "option" => [ + "any" => "Any ratio", + "t|xt" => "Tall", + "s" => "Square", + "w" => "Wide", + "xw" => "Panoramic" + ] + ], + "color" => [ // imgc + "display" => "Color", + "option" => [ + "any" => "Any color", + "color" => "Full color", + "bnw" => "Black & white", + "trans" => "Transparent", + // from here, imgcolor + "red" => "Red", + "orange" => "Orange", + "yellow" => "Yellow", + "green" => "Green", + "teal" => "Teal", + "blue" => "Blue", + "purple" => "Purple", + "pink" => "Pink", + "white" => "White", + "gray" => "Gray", + "black" => "Black", + "brown" => "Brown" + ] + ], + "type" => [ // tbs=itp:<type> + "display" => "Type", + "option" => [ + "any" => "Any type", + "clipart" => "Clip Art", + "lineart" => "Line Drawing", + "animated" => "Animated" + ] + ], + "format" => [ // as_filetype + "display" => "Format", + "option" => [ + "any" => "Any format", + "jpg" => "JPG", + "gif" => "GIF", + "png" => "PNG", + "bmp" => "BMP", + "svg" => "SVG", + "webp" => "WEBP", + "ico" => "ICO", + "craw" => "RAW" + ] + ], + "rights" => [ // tbs=sur:<rights> + "display" => "Usage rights", + "option" => [ + "any" => "Any license", + "cl" => "Creative Commons licenses", + "ol" => "Commercial & other licenses" + ] + ] + ] + ); + break;*/ + } + } + + private function get($proxy, $url, $get = []){ + + $curlproc = curl_init(); + + $headers = [ + "Accept: application/json", + "Accept-Encoding: gzip" + ]; + + if($get !== []){ + $get = http_build_query($get); + $url .= "?" . $get; + } + + curl_setopt($curlproc, CURLOPT_URL, $url); + + curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding + curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); + + curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); + curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); + curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); + curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); + + // follow redirects + curl_setopt($curlproc, CURLOPT_FOLLOWLOCATION, true); + + $this->backend->assign_proxy($curlproc, $proxy); + + $data = curl_exec($curlproc); + + if(curl_errno($curlproc)){ + + throw new Exception(curl_error($curlproc)); + } + + curl_close($curlproc); + + return $data; + } + + public function web($get){ + + // rotate proxy + key on EVERY request + $keydata = $this->backend->get_key(); + $proxy = $this->backend->get_ip($keydata["increment"]); + + if($get["npt"]){ + + // $p is never used + [$params, $p] = $this->backend->get( + $get["npt"], + "web" + ); + + $params = json_decode($params, true); + + $params["key"] = $keydata["key"]; + + }else{ + + //$json = file_get_contents("scraper/google.json"); + $params = [ + "q" => $get["s"], + "cx" => config::GOOGLE_CX_ENDPOINT, + "num" => 10, + "start" => 1, + "key" => $keydata["key"] + ]; + + // + // parse filters + // + if($get["newer"] !== false){ + + $params["dateRestrict"] = "d" . (round((time() - $get["newer"]) / 100000)); + } + + if($get["rm_dupes"] == "no"){ $params["filter"] = "0"; } + if($get["country"] != "any"){ $params["gl"] = $get["country"]; } + if($get["lang"] != "any"){ $params["lr"] = "lang_" . $get["lang"]; } + + if($get["nsfw"] == "yes"){ + + $params["safe"] = "off"; + }else{ + + $params["safe"] = "active"; + } + + if($get["sort"] != "any"){ $params["sort"] = $get["sort"]; } + } + + try{ + $json = + $this->get( + $proxy, + "https://www.googleapis.com/customsearch/v1", + $params + ); + }catch(Exception $error){ + + throw new Exception("Failed to fetch JSON"); + } + + $json = json_decode($json, true); + + if($json === null){ + + throw new Exception("Failed to decode JSON"); + } + + $out = [ + "status" => "ok", + "spelling" => [ + "type" => "no_correction", + "using" => null, + "correction" => null + ], + "npt" => null, + "answer" => [], + "web" => [], + "image" => [], + "video" => [], + "news" => [], + "related" => [] + ]; + + if(isset($json["error"]["message"])){ + + throw new Exception( + "API returned an error: " . + $json["error"]["message"] . + " (key #" . $keydata["increment"] . ")" + ); + } + + if(!isset($json["items"])){ + + throw new Exception("Failed to access items array"); + } + + foreach($json["items"] as $result){ + + // + // probe for thumbnail + // + $probes = [ + isset($result["pagemap"]["cse_thumbnail"][0]["src"]) ? $result["pagemap"]["cse_thumbnail"][0]["src"] : null, + isset($result["pagemap"]["cse_image"][0]["src"]) ? $result["pagemap"]["cse_image"][0]["src"] : null, + isset($result["pagemap"]["metatags"][0]["twitter:image"]) ? $result["pagemap"]["metatags"][0]["twitter:image"] : null, + isset($result["pagemap"]["metatags"][0]["og:image"]) ? $result["pagemap"]["metatags"][0]["og:image"] : null + ]; + + $thumb = [ + "url" => null, + "ratio" => null + ]; + + foreach($probes as $probe){ + + if($probe !== null){ + + $thumb = [ + "url" => $probe, + "ratio" => "16:9" + ]; + break; + } + } + + // + // probe for page format + // + $mime = "web"; + if(isset($result["mime"])){ + + $result["mime"] = + explode( + "/", + $result["mime"], + 2 + ); + + if(count($result["mime"]) === 2){ + + $mime = strtoupper($result["mime"][1]); + } + } + + $description = $result["snippet"]; + + // + // Get date + // + $description_split = + explode( + "...", $description, 2 + ); + + if(count($description_split) === 1){ + + $description = $result["snippet"]; + }elseif(strlen($description_split[0]) < 17){ + + $date = trim($description_split[0]); + $date_probe = strtotime($date); + + if($date_probe !== false){ + + $description = $description_split[1]; + }else{ + + // + // fallback to getting date from meta tags + // + if(isset($result["pagemap"]["metatags"][0]["creationdate"])){ + + $date = $result["pagemap"]["metatags"][0]["creationdate"]; + + }elseif(isset($result["pagemap"]["metatags"][0]["moddate"])){ + + $date = $result["pagemap"]["metatags"][0]["moddate"]; + }else{ + + $date = null; + } + + $description = $result["snippet"]; + } + } + + if($date !== null){ + + $date = + strtotime( + trim( + str_replace( + ["D:", "'"], + "", + $date + ) + ) + ); + + if($date === false){ + + $date = null; + } + } + + $out["web"][] = [ + "title" => + $this->titledots( + $result["title"] + ), + "description" => + $this->titledots( + $description + ), + "url" => $result["link"], + "date" => $date, + "type" => $mime, + "thumb" => $thumb, + "sublink" => [], + "table" => [] + ]; + } + + // get npt + if(isset($json["queries"]["nextPage"][0]["startIndex"])){ + + $filters["start"] = (int)$json["queries"]["nextPage"][0]["startIndex"]; + + unset($params["key"]); + $params["start"] += 10; + + $out["npt"] = + $this->backend->store( + json_encode($params), + "web", + $proxy + ); + } + + return $out; + } + + private function titledots($title){ + + return trim($title, " .\t\n\r\0\x0B…"); + } +} diff --git a/settings.php b/settings.php index d680d52..b1949f6 100644 --- a/settings.php +++ b/settings.php @@ -138,6 +138,10 @@ $settings = [ "text" => "Google" ], [ + "value" => "google_api", + "text" => "Google API" + ], + [ "value" => "google_cse", "text" => "Google CSE" ], |
