aboutsummaryrefslogtreecommitdiffstats
path: root/scraper/marginalia.php
diff options
context:
space:
mode:
author: lolcat <will@lolcat.ca> 2024-11-07 00:12:06 -0500
committer: lolcat <will@lolcat.ca> 2024-11-07 00:12:06 -0500
commit: e83865be492b887dc1c976e21449fc71d277e3d5 (patch)
tree: 2ee87e70f271ee8db075c571478c1ff62d0ea6fa /scraper/marginalia.php
parent: 68dd7f29f61122af70dc678c490c7482eced59cd (diff)
added pagination
Diffstat (limited to 'scraper/marginalia.php')
-rw-r--r-- scraper/marginalia.php 125
1 files changed, 104 insertions, 21 deletions
diff --git a/scraper/marginalia.php b/scraper/marginalia.php
index 2a2c1e6..2169fbc 100644
--- a/scraper/marginalia.php
+++ b/scraper/marginalia.php
@@ -220,6 +220,7 @@ class marginalia{
"related" => []
];
+ // API scraper
if(config::MARGINALIA_API_KEY !== null){
try{
@@ -263,34 +264,57 @@ class marginalia{
return $out;
}
- // no more cloudflare!! Parse html by default
- $params = [
- "query" => $search
- ];
+ // HTML parser
+ $proxy = $this->backend->get_ip();
- foreach(["adtech", "recent", "intitle"] as $v){
+ if($get["npt"]){
+
+ [$params, $proxy] =
+ $this->backend->get(
+ $get["npt"],
+ "web"
+ );
+
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://search.marginalia.nu/search?" . $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
+
+ }else{
+ $params = [
+ "query" => $search
+ ];
- if($get[$v] == "yes"){
+ foreach(["adtech", "recent", "intitle"] as $v){
- switch($v){
+ if($get[$v] == "yes"){
- case "adtech": $params["adtech"] = "reduce"; break;
- case "recent": $params["recent"] = "recent"; break;
- case "adtech": $params["searchTitle"] = "title"; break;
+ switch($v){
+
+ case "adtech": $params["adtech"] = "reduce"; break;
+ case "recent": $params["recent"] = "recent"; break;
+ case "adtech": $params["searchTitle"] = "title"; break;
+ }
}
}
- }
-
- try{
- $html =
- $this->get(
- $this->backend->get_ip(),
- "https://search.marginalia.nu/search",
- $params
- );
- }catch(Exception $error){
- throw new Exception("Failed to get HTML");
+ try{
+ $html =
+ $this->get(
+ $proxy,
+ "https://search.marginalia.nu/search",
+ $params
+ );
+ }catch(Exception $error){
+
+ throw new Exception("Failed to get HTML");
+ }
}
$this->fuckhtml->load($html);
@@ -387,6 +411,65 @@ class marginalia{
];
}
+ // get next page
+ $this->fuckhtml->load($html);
+
+ $pagination =
+ $this->fuckhtml
+ ->getElementsByAttributeValue(
+ "aria-label",
+ "pagination",
+ "nav"
+ );
+
+ if(count($pagination) === 0){
+
+ // no pagination
+ return $out;
+ }
+
+ $this->fuckhtml->load($pagination[0]);
+
+ $pages =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "page-link",
+ "a"
+ );
+
+ $found_current_page = false;
+
+ foreach($pages as $page){
+
+ if(
+ stripos(
+ $page["attributes"]["class"],
+ "active"
+ ) !== false
+ ){
+
+ $found_current_page = true;
+ continue;
+ }
+
+ if($found_current_page){
+
+ // we found current page index, and we iterated over
+ // the next page <a>
+
+ $out["npt"] =
+ $this->backend->store(
+ parse_url(
+ $page["attributes"]["href"],
+ PHP_URL_QUERY
+ ),
+ "web",
+ $proxy
+ );
+ break;
+ }
+ }
+
return $out;
}
}
Send patches to the email address below:
yukais@pinapelz.com
Include "[PATCH repo_name]" in the subject line.
pinapelz.com
homepage