aboutsummaryrefslogtreecommitdiffstats
path: root/scraper
diff options
context:
space:
mode:
authorlolcat <will@lolcat.ca>2025-01-06 23:56:49 -0500
committerlolcat <will@lolcat.ca>2025-01-06 23:56:49 -0500
commit463ba0775f9c5fd60c3e4bf3fddb70028de550c7 (patch)
tree0c7c6a3c2efe89a731cccdcc5e54c2f517c2df00 /scraper
parentcfad4fb035eb608e4cbd24cf955cdd455fa983f6 (diff)
added pinterest
Diffstat (limited to 'scraper')
-rw-r--r--scraper/pinterest.php346
1 files changed, 269 insertions, 77 deletions
diff --git a/scraper/pinterest.php b/scraper/pinterest.php
index 3787f77..4188bce 100644
--- a/scraper/pinterest.php
+++ b/scraper/pinterest.php
@@ -13,31 +13,104 @@ class pinterest{
return [];
}
- private function get($proxy, $url, $get = []){
+ private function get($proxy, $url, $get = [], &$cookies, $header_data_post = null){
$curlproc = curl_init();
- if($get !== []){
+ if($header_data_post === null){
+
+ // handling GET
+
+ // extract cookies
+ $cookies_tmp = [];
+ curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
+
+ $length = strlen($header);
+
+ $header = explode(":", $header, 2);
+
+ if(trim(strtolower($header[0])) == "set-cookie"){
+
+ $cookie_tmp = explode("=", trim($header[1]), 2);
+
+ $cookies_tmp[trim($cookie_tmp[0])] =
+ explode(";", $cookie_tmp[1], 2)[0];
+ }
+
+ return $length;
+ });
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: application/json, text/javascript, */*, q=0.01",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Referer: https://ca.pinterest.com/",
+ "X-Requested-With: XMLHttpRequest",
+ "X-APP-VERSION: 78f8764",
+ "X-Pinterest-AppState: active",
+ "X-Pinterest-Source-Url: /",
+ "X-Pinterest-PWS-Handler: www/index.js",
+ "screen-dpr: 1",
+ "is-preload-enabled: 1",
+ "DNT: 1",
+ "Sec-GPC: 1",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-origin",
+ "Connection: keep-alive",
+ "Alt-Used: ca.pinterest.com",
+ "Priority: u=0",
+ "TE: trailers"]
+ );
+
+ if($get !== []){
+ $get = http_build_query($get);
+ $url .= "?" . $get;
+ }
+ }else{
+
+ // handling POST (pagination)
$get = http_build_query($get);
- $url .= "?" . $get;
+
+ curl_setopt($curlproc, CURLOPT_HTTPHEADER,
+ ["User-Agent: " . config::USER_AGENT,
+ "Accept: application/json, text/javascript, */*, q=0.01",
+ "Accept-Language: en-US,en;q=0.5",
+ "Accept-Encoding: gzip",
+ "Content-Type: application/x-www-form-urlencoded",
+ "Content-Length: " . strlen($get),
+ "Referer: https://ca.pinterest.com/",
+ "X-Requested-With: XMLHttpRequest",
+ "X-APP-VERSION: 78f8764",
+ "X-CSRFToken: " . $cookies["csrf"],
+ "X-Pinterest-AppState: active",
+ "X-Pinterest-Source-Url: /search/pins/?rs=ac&len=2&q=" . urlencode($header_data_post) . "&eq=" . urlencode($header_data_post),
+ "X-Pinterest-PWS-Handler: www/search/[scope].js",
+ "screen-dpr: 1",
+ "is-preload-enabled: 1",
+ "Origin: https://ca.pinterest.com",
+ "DNT: 1",
+ "Sec-GPC: 1",
+ "Sec-Fetch-Dest: empty",
+ "Sec-Fetch-Mode: cors",
+ "Sec-Fetch-Site: same-origin",
+ "Connection: keep-alive",
+ "Alt-Used: ca.pinterest.com",
+ "Cookie: " . $cookies["cookie"],
+ "TE: trailers"]
+ );
+
+ curl_setopt($curlproc, CURLOPT_POST, true);
+ curl_setopt($curlproc, CURLOPT_POSTFIELDS, $get);
}
curl_setopt($curlproc, CURLOPT_URL, $url);
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
- curl_setopt($curlproc, CURLOPT_HTTPHEADER,
- ["User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/110.0",
- "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
- "Accept-Language: en-US,en;q=0.5",
- "Accept-Encoding: gzip",
- "DNT: 1",
- "Connection: keep-alive",
- "Upgrade-Insecure-Requests: 1",
- "Sec-Fetch-Dest: document",
- "Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
- "Sec-Fetch-User: ?1"]
- );
+
+ // http2 bypass
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
@@ -54,6 +127,26 @@ class pinterest{
throw new Exception(curl_error($curlproc));
}
+ if($header_data_post === null){
+
+ if(!isset($cookies_tmp["csrftoken"])){
+
+ throw new Exception("Failed to grep CSRF token");
+ }
+
+ $cookies = "";
+
+ foreach($cookies_tmp as $cookie_name => $cookie_value){
+
+ $cookies .= $cookie_name . "=" . $cookie_value . "; ";
+ }
+
+ $cookies = [
+ "csrf" => $cookies_tmp["csrftoken"],
+ "cookie" => rtrim($cookies, " ;")
+ ];
+ }
+
curl_close($curlproc);
return $data;
}
@@ -62,17 +155,68 @@ class pinterest{
if($get["npt"]){
- // @TODO
- // post data for next page
- $data = [
- "source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
- "data" =>
- json_encode(
+ [$data, $proxy] =
+ $this->backend->get(
+ $get["npt"], "images"
+ );
+
+ $data = json_decode($data, true);
+
+ $search = $data["q"];
+ $cookies = $data["cookies"];
+
+ try{
+ $json =
+ $this->get(
+ $proxy,
+ "https://ca.pinterest.com/resource/BaseSearchResource/get/",
[
- // {"options":{"applied_filters":null,"appliedProductFilters":"---","article":null,"auto_correction_disabled":false,"corpus":null,"customized_rerank_type":null,"domains":null,"filters":null,"journey_depth":null,"page_size":null,"price_max":null,"price_min":null,"query_pin_sigs":null,"query":"higurashi","redux_normalize_feed":true,"rs":"typed","scope":"pins","selected_one_bar_modules":null,"source_id":null,"source_module_id":null,"top_pin_id":null,"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1ZsWlVRbXhpVmtreVZsZHpOV0pIU2tkV2FscFhVbXhhVkZreU1WSmtNREZWVjIxR1RrMXNTbEJXYlhSaFVtMVdjMVZ1U2xaaWEzQnpXVlJPVTJWV1pISlhhM1JYVm10V05sVldVbE5XVjBwMVVXMUdWVll6VFhoVWJYaFhWMVp3Ums1V1RsTmlSbGt5Vm10YWFtVkdWbkpOU0dSUFZsZG9XRmxzWkc5VlZscHlWbGhrYkdKR1NubFdWelZQWVVaYWRHVkVRbFppUmtwVVZrUktWMlJIVWtWV2JHaHBVakZLU0Zkc1pEUmtNVnBZVW10b2FsSXdXbkJXYlRWRFpHeGFSMWRzVG1oaGVrWllXV3RvVTFVeFpFaFZiRUpoVm5wRk1GbHFSbXRYVjA1R1YyczFWMVpHV2pSWFZtaDNVakZrY2sxWVRsaGlhM0JXV1ZSR1MyRkdiRlZTYm1SVVVteHdXbGxWVlRGVk1VbDVWRmhrVjAxdVVuWlVhMXBTWlVaT2MxcEhSbE5TTWswMVdtdGFWMU5YU2paVmJYaFRUVmhDUjFZeU5YZFVNVkY0VjJ0b1ZXRnJOVlpVVmxwTFVURndXR042VmxOV2ExcGFXVlZWTlZVeFNYZE5WRTVYVWtWYVZGWkhNVTlXTVU1WllVWk9hR1ZyV2s1WFZ6QXhZakpPVjFWWWFHRlNWbkJRVm14U1IwMUdXWGxOVkVKVlRWWnNORll5TURWV1YwVjVWV3hDV21FeGNETmFSVnByVjFkS1IyTkhhR2xYUjJkM1ZtdGFhMlF4VVhsVGJGcE9Wa1p3YjFwWGVFdFZWbFp4VW14YWJGWnRVbHBaTUdoTFZHMUtTR1ZJYUZkV2VrWjJWMVphU21ReVJYcGpSbFpwVW10d1RGZHJVa0pPVms1SFZHNVNUbFl3V2xoVmJYUldaVVpaZUZremFGUk5hM0JYVkZaYVYyRkZNSGxWYkVKYVlrWlZlRnBGV210WFIwNUpVMnMxVTFaR1dscFdWekI0VFVaV1IxTllaR3BUUlhCb1dWUkdWbVZHVm5SbFJuQnNZbFpKTWxSVlVYaFBSVGxGV1hwR1QyVnJSVEZVVlZKT1RrVXhSVkpVUWs5bGJFVXhWRmhzZDFOR1ZsWmtNMFp0VWpGYWIxZFhjRXBsUlRGSVZWaHdUbFl4YTNoVVZWSnFUVVUxV0ZadGFFOVNSVnB6Vkd0a1drMUdiRFpUVkVaT1pXMWplRmRzVWxkaFJuQllWVlJTVDJWdFRqWlVNVkpTWlZad2NWcEhkRTlsYTFwMFZGVlNhMkpWTVZWVFZFcE9Wa1pzTmxkWE1WSk9WVEYwVlcweFVGWXdXVFJXUjNSWFYwZGFRbEJVTVRoUFJHTXhUbnBCTlUxRVRUUk5SRVV3VG5wUk5VMTVjRWhWVlhkeFprUlZlRTlFVVRKWlZHc3lUMWRSTWsxVVVUSk9iVnBvV1RKWmVrNTZXWGhPTWs1cFQwUkZNVTlFVm1sTlZGcHBUV3BTYTFsWFRtcE9SR015VG1wVk5GbHFaR2haVjFacldWUmFiVmxxWkdoYVZGWnFUa1JXT0ZSclZsaG1RVDA5fFVIbzVhRkpYZUc1WFYyUlpWVEpHYkdGNk1XWk5ha1ptVFZSR09FOUVZekZPZWtFMVRVUk5ORTFFUlRCT2VsRTFUWGx3U0ZWVmQzRm1SMWw1VFZSUk1WbDZUVEJhUjFGNVQxZFNhVnB0VlRGT1JFVXdXVlJuZVU1cVRUUk5hbU40VDBSSk1VNXFWVEZOYlZwcVdsUnJlRTFFVVhwWmVsVjNXbXBvYkU1dFJYbE9ha0Y2VDFSSk5VMTZWVEJaYWtJNFZHdFdXR1pCUFQwPXxOb25lfDg3NTcwOTAzODAxNDc0OTMqR1FMKnwzMjM3YjM3ZGNhMGU3YjYyYzYzYzAyZGJkNGU1MjdlNzMyMTExMTNlMmUyMzEyOWM2MDAzYmU1ZTlmZjkwYjAwfE5FV3w="]},"context":{}}
- ]
+ "source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
+ "data" => json_encode(
+ [
+ "options" => [
+ "applied_unified_filters" => null,
+ "appliedProductFilters" => "---",
+ "article" => null,
+ "auto_correction_disabled" => false,
+ "corpus" => null,
+ "customized_rerank_type" => null,
+ "domains" => null,
+ "dynamicPageSizeExpGroup" => null,
+ "filters" => null,
+ "journey_depth" => null,
+ "page_size" => null,
+ "price_max" => null,
+ "price_min" => null,
+ "query_pin_sigs" => null,
+ "query" => $data["q"],
+ "redux_normalize_feed" => true,
+ "request_params" => null,
+ "rs" => "typed",
+ "scope" => "pins",
+ "selected_one_bar_modules" => null,
+ "source_id" => null,
+ "source_module_id" => null,
+ "source_url" => "/search/pins/?q=" . urlencode($search) . "&rs=typed",
+ "top_pin_id" => null,
+ "top_pin_ids" => null,
+ "bookmarks" => [
+ $data["bookmark"]
+ ]
+ ],
+ "context" => []
+ ],
+ JSON_UNESCAPED_SLASHES
+ )
+ ],
+ $cookies,
+ $search
);
- ];
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch JSON");
+ }
}else{
@@ -81,27 +225,45 @@ class pinterest{
throw new Exception("Search term is empty!");
}
+
+ // https://ca.pinterest.com/resource/BaseSearchResource/get/?source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac&data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D&_=1736116313987
+ // source_url=%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac
+ // &data=%7B%22options%22%3A%7B%22applied_unified_filters%22%3Anull%2C%22appliedProductFilters%22%3A%22---%22%2C%22article%22%3Anull%2C%22auto_correction_disabled%22%3Afalse%2C%22corpus%22%3Anull%2C%22customized_rerank_type%22%3Anull%2C%22domains%22%3Anull%2C%22dynamicPageSizeExpGroup%22%3Anull%2C%22filters%22%3Anull%2C%22journey_depth%22%3Anull%2C%22page_size%22%3Anull%2C%22price_max%22%3Anull%2C%22price_min%22%3Anull%2C%22query_pin_sigs%22%3Anull%2C%22query%22%3A%22higurashi%20when%20they%20cry%22%2C%22redux_normalize_feed%22%3Atrue%2C%22request_params%22%3Anull%2C%22rs%22%3A%22ac%22%2C%22scope%22%3A%22pins%22%2C%22selected_one_bar_modules%22%3Anull%2C%22source_id%22%3Anull%2C%22source_module_id%22%3Anull%2C%22source_url%22%3A%22%2Fsearch%2Fpins%2F%3Feq%3Dhigurashi%26etslf%3D5966%26len%3D2%26q%3Dhigurashi%2520when%2520they%2520cry%26rs%3Dac%22%2C%22top_pin_id%22%3Anull%2C%22top_pin_ids%22%3Anull%7D%2C%22context%22%3A%7B%7D%7D
+ // &_=1736116313987
+
+ $source_url = "/search/pins/?q=" . urlencode($search) . "&rs=" . urlencode($search);
$filter = [
- "source_url" => "/search/pins/?q=" . urlencode($search),
+ "source_url" => $source_url,
"rs" => "typed",
"data" =>
json_encode(
[
"options" => [
- "article" => null,
- "applied_filters" => null,
+ "applied_unified_filters" => null,
"appliedProductFilters" => "---",
- "auto_correction_disabled" => false,
+ "article" => null,
"corpus" => null,
"customized_rerank_type" => null,
+ "domains" => null,
+ "dynamicPageSizeExpGroup" => null,
"filters" => null,
- "query" => $search,
+ "journey_depth" => null,
+ "page_size" => null,
+ "price_max" => null,
+ "price_min" => null,
"query_pin_sigs" => null,
+ "query" => $search,
"redux_normalize_feed" => true,
- "rs" => "typed",
+ "request_params" => null,
+ "rs" => "ac",
"scope" => "pins", // pins, boards, videos,
- "source_id" => null
+ "selected_one_bar_modules" => null,
+ "source_id" => null,
+ "source_module_id" => null,
+ "source_url" => $source_url,
+ "top_pin_id" => null,
+ "top_pin_ids" => null
],
"context" => []
]
@@ -110,24 +272,26 @@ class pinterest{
];
$proxy = $this->backend->get_ip();
- }
-
- try{
- $json =
- json_decode(
+ $cookies = [];
+
+ try{
+ $json =
$this->get(
$proxy,
- "https://www.pinterest.ca/resource/BaseSearchResource/get/",
- $filter
- ),
- true
- );
-
- }catch(Exception $error){
-
- throw new Exception("Failed to fetch JSON");
+ "https://ca.pinterest.com/resource/BaseSearchResource/get/",
+ $filter,
+ $cookies,
+ null
+ );
+
+ }catch(Exception $error){
+
+ throw new Exception("Failed to fetch JSON");
+ }
}
+ $json = json_decode($json, true);
+
if($json === null){
throw new Exception("Failed to decode JSON");
@@ -139,6 +303,60 @@ class pinterest{
"image" => []
];
+ if(
+ !isset(
+ $json["resource_response"]
+ ["status"]
+ )
+ ){
+
+ throw new Exception("Unknown API failure");
+ }
+
+ if($json["resource_response"]["status"] != "success"){
+
+ $status = "Got non-OK response: " . $json["resource_response"]["status"];
+
+ if(
+ isset(
+ $json["resource_response"]["message"]
+ )
+ ){
+
+ $status .= " - " . $json["resource_response"]["message"];
+ }
+
+ throw new Exception($status);
+ }
+
+ if(
+ isset(
+ $json["resource_response"]["sensitivity"]
+ ["notices"][0]["description"]["text"]
+ )
+ ){
+
+ throw new Exception(
+ "Pinterest returned a notice: " .
+ $json["resource_response"]["sensitivity"]["notices"][0]["description"]["text"]
+ );
+ }
+
+ // get NPT
+ if(isset($json["resource_response"]["bookmark"])){
+
+ $out["npt"] =
+ $this->backend->store(
+ json_encode([
+ "q" => $search,
+ "bookmark" => $json["resource_response"]["bookmark"],
+ "cookies" => $cookies
+ ]),
+ "images",
+ $proxy
+ );
+ }
+
foreach(
$json
["resource_response"]
@@ -150,6 +368,7 @@ class pinterest{
switch($item["type"]){
case "pin":
+ case "board":
/*
Handle image object
@@ -206,42 +425,15 @@ class pinterest{
"height" => (int)$thumb["height"]
]
],
- "url" => "https://www.pinterest.com/pin/" . $item["id"]
+ "url" =>
+ $item["link"] === null ?
+ "https://ca.pinterest.com/pin/" . $item["id"] :
+ $item["link"]
];
break;
-
- case "board":
- if(isset($item["cover_pin"]["image_url"])){
-
- $image = [
- "url" => $item["cover_pin"]["image_url"],
- "width" => (int)$item["cover_pin"]["size"][0],
- "height" => (int)$item["cover_pin"]["size"][1]
- ];
- }elseif(isset($item["image_cover_url_hd"])){
- /*
- $image = [
- "url" =>
- "width" => null,
- "height" => null
- ];*/
- }
- break;
}
}
return $out;
}
-
- private function getfullresimage($image, $has_og){
-
- $has_og = $has_og ? "1200x" : "originals";
-
- return
- preg_replace(
- '/https:\/\/i\.pinimg\.com\/[^\/]+\//',
- "https://i.pinimg.com/" . $has_og . "/",
- $image
- );
- }
}
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage