about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- scraper/greppr.php 328
1 files changed, 164 insertions, 164 deletions
diff --git a/scraper/greppr.php b/scraper/greppr.php
index 2f425a6..fc8511c 100644
--- a/scraper/greppr.php
+++ b/scraper/greppr.php
@@ -16,49 +16,82 @@ class greppr{
return [];
}
- private function get($proxy, $url, $get = [], $cookie = false){
+ private function get($proxy, $url, $get = [], $cookie = false, $post){
$curlproc = curl_init();
- if($get !== []){
- $get = http_build_query($get);
- $url .= "?" . $get;
- }
-
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
- if($cookie === false){
+ if($post === false){
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "DNT: 1",
- "Connection: keep-alive",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
- );
+ if($cookie === false){
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "DNT: 1",
+ "Connection: keep-alive",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: none",
+ "Sec-Fetch-User: ?1"]
+ );
+ }else{
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip, deflate, br, zstd",
+ "DNT: 1",
+ "Sec-GPC: 1",
+ "Connection: keep-alive",
+ "Referer: https://greppr.org/search",
+ "Cookie: PHPSESSID=$cookie",
+ "Upgrade-Insecure-Requests: 1",
+ "Sec-Fetch-Dest: document",
+ "Sec-Fetch-Mode: navigate",
+ "Sec-Fetch-Site: same-origin",
+ "Sec-Fetch-User: ?1",
+ "Priority: u=0, i"]
+ );
+ }
}else{
+ $get = http_build_query($get);
+
+ curl_setopt($curlproc, CURLOPT_POST, true);
+ curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
+
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
+ "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "Cookie: PHPSESSID=" . $cookie,
+ "Accept-Encoding: gzip, deflate, br, zstd",
+ "Content-Type: application/x-www-form-urlencoded",
+ "Content-Length: " . strlen($get),
+ "Origin: https://greppr.org",
"DNT: 1",
+ "Sec-GPC: 1",
"Connection: keep-alive",
+ "Referer: https://greppr.org/",
+ "Cookie: PHPSESSID=$cookie",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
+ "Sec-Fetch-Site: same-origin",
+ "Sec-Fetch-User: ?1",
+ "Priority: u=0, i"]
);
}
@@ -113,7 +146,24 @@ class greppr{
[$q, $proxy] = $this->backend->get($get["npt"], "web");
- $q = json_decode($q, true);
+ $tokens = json_decode($q, true);
+
+ //
+ // Get paginated page
+ //
+ try{
+
+ $html = $this->get(
+ $proxy,
+ "https://greppr.org" . $tokens["get"],
+ [],
+ $tokens["cookie"],
+ false
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
}else{
@@ -124,88 +174,114 @@ class greppr{
}
$proxy = $this->backend->get_ip();
- }
-
- // get token
- // token[0] = static token that changes once a day
- // token[1] = dynamic token that changes on every request
- // token[2] = PHPSESSID cookie
- $tokens = apcu_fetch("greppr_token");
-
- if(
- $tokens === false ||
- $first_attempt === false // force token fetch
- ){
- // we haven't gotten the token yet, get it
+ //
+ // get token
+ //
try{
- $response =
+ $html =
$this->get(
$proxy,
"https://greppr.org",
- []
+ [],
+ false,
+ false
);
}catch(Exception $error){
throw new Exception("Failed to fetch search tokens");
}
- $tokens = $this->parse_token($response);
-
- if($tokens === false){
-
- throw new Exception("Failed to grep search tokens");
- }
- }
+ //
+ // Parse token
+ //
+ $this->fuckhtml->load($html["data"]);
- try{
+ $tokens = [];
- if($get["npt"]){
+ $inputs =
+ $this->fuckhtml
+ ->getElementsByTagName(
+ "input"
+ );
- $params = [
- $tokens[0] => $q["q"],
- "s" => $q["s"],
- "l" => 30,
- "n" => $tokens[1]
- ];
- }else{
+ foreach($inputs as $input){
+
+ if(!isset($input["attributes"]["name"])){
+
+ continue;
+ }
- $params = [
- $tokens[0] => $search,
- "n" => $tokens[1]
- ];
+ switch($input["attributes"]["name"]){
+
+ case "var1":
+ case "var2":
+ case "n":
+ $tokens[$input["attributes"]["name"]] =
+ $this->fuckhtml
+ ->getTextContent(
+ $input["attributes"]["value"]
+ );
+ break;
+
+ default:
+ $tokens["req"] =
+ $this->fuckhtml
+ ->getTextContent(
+ $input["attributes"]["name"]
+ );
+ break;
+ }
}
- $searchresults = $this->get(
- $proxy,
- "https://greppr.org/search",
- $params,
- $tokens[2]
+ // get cookie
+ preg_match(
+ '/PHPSESSID=([^;]+)/',
+ $html["headers"]["set-cookie"],
+ $cookie
);
- }catch(Exception $error){
- throw new Exception("Failed to fetch search page");
- }
-
- if(strlen($searchresults["data"]) === 0){
+ if(!isset($cookie[1])){
+
+ // server sent an unexpected cookie
+ throw new Exception("Got malformed cookie");
+ }
- // redirected to main page, which means we got old token
- // generate a new one
+ $tokens["cookie"] = $cookie[1];
- // ... unless we just tried to do that
- if($first_attempt === false){
+ if($tokens === false){
- throw new Exception("Failed to get a new search token");
+ throw new Exception("Failed to grep search tokens");
}
- return $this->web($get, false);
+ //
+ // Get initial search page
+ //
+ try{
+
+ $html = $this->get(
+ $proxy,
+ "https://greppr.org/search",
+ [
+ "var1" => $tokens["var1"],
+ "var2" => $tokens["var2"],
+ $tokens["req"] => $search,
+ "n" => $tokens["n"]
+ ],
+ $tokens["cookie"],
+ true
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch search page");
+ }
}
- // refresh the token with new data (this also triggers fuckhtml load)
- $this->parse_token($searchresults, $tokens[2]);
+ //$html = file_get_contents("scraper/greppr.html");
+ //$this->fuckhtml->load($html);
+ $this->fuckhtml->load($html["data"]);
- // response object
$out = [
"status" => "ok",
"spelling" => [
@@ -254,24 +330,16 @@ class greppr{
if($break === true){
- parse_str(
- $this->fuckhtml
- ->getTextContent(
- $a["attributes"]["href"]
- ),
- $values
- );
-
- $values = array_values($values);
-
$out["npt"] =
$this->backend->store(
- json_encode(
- [
- "q" => $values[0],
- "s" => $values[1]
- ]
- ),
+ json_encode([
+ "get" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $a["attributes"]["href"]
+ ),
+ "cookie" => $tokens["cookie"]
+ ]),
"web",
$proxy
);
@@ -360,74 +428,6 @@ class greppr{
return $out;
}
- private function parse_token($response, $cookie = false){
-
- $this->fuckhtml->load($response["data"]);
-
- $scripts =
- $this->fuckhtml
- ->getElementsByTagName("script");
-
- $found = false;
- foreach($scripts as $script){
-
- preg_match(
- '/window\.location ?= ?\'\/search\?([^=]+).*&n=([0-9]+)/',
- $script["innerHTML"],
- $tokens
- );
-
- if(isset($tokens[1])){
-
- $found = true;
- break;
- }
- }
-
- if($found === false){
-
- return false;
- }
-
- $tokens = [
- $tokens[1],
- $tokens[2]
- ];
-
- if($cookie !== false){
-
- // we already specified a cookie, so use the one we have already
- $tokens[] = $cookie;
- apcu_store("greppr_token", $tokens);
-
- return $tokens;
- }
-
- if(!isset($response["headers"]["set-cookie"])){
-
- // server didn't send a cookie
- return false;
- }
-
- // get cookie
- preg_match(
- '/PHPSESSID=([^;]+)/',
- $response["headers"]["set-cookie"],
- $cookie
- );
-
- if(!isset($cookie[1])){
-
- // server sent an unexpected cookie
- return false;
- }
-
- $tokens[] = $cookie[1];
- apcu_store("greppr_token", $tokens);
-
- return $tokens;
- }
-
private function limitstrlen($text){
return explode("\n", wordwrap($text, 300, "\n"))[0];
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage