about | summary | refs | log | tree | commit | diff | stats
path: root/scraper
diff options
context:
space:
mode:
Diffstat (limited to 'scraper')
-rw-r--r-- scraper/mwmbl.php 78
1 file changed, 73 insertions, 5 deletions
diff --git a/scraper/mwmbl.php b/scraper/mwmbl.php
index 671ec78..f2f8b70 100644
--- a/scraper/mwmbl.php
+++ b/scraper/mwmbl.php
@@ -27,18 +27,24 @@ class mwmbl{
curl_setopt($curlproc, CURLOPT_URL, $url);
+ // use http2
+ curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
+
curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
curl_setopt($curlproc, CURLOPT_HTTPHEADER,
["User-Agent: " . config::USER_AGENT,
"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
"Accept-Language: en-US,en;q=0.5",
"Accept-Encoding: gzip",
+ "Referer: https://beta.mwmbl.org/",
"DNT: 1",
+ "Sec-GPC: 1",
"Connection: keep-alive",
"Upgrade-Insecure-Requests: 1",
"Sec-Fetch-Dest: document",
"Sec-Fetch-Mode: navigate",
- "Sec-Fetch-Site: none",
+ "Sec-Fetch-Site: same-origin",
+ "Priority: u=0, i",
"Sec-Fetch-User: ?1"]
);
@@ -46,7 +52,7 @@ class mwmbl{
curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
- curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
+ curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
$this->backend->assign_proxy($curlproc, $proxy);
@@ -72,14 +78,14 @@ class mwmbl{
try{
$html = $this->get(
$this->backend->get_ip(), // no next page!
- "https://mwmbl.org/app/home/",
+ "https://beta.mwmbl.org/",
[
"q" => $search
]
);
}catch(Exception $error){
- throw new Exception("Failed to fetch HTML");
+ throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup.");
}
$out = [
@@ -115,6 +121,68 @@ class mwmbl{
$this->fuckhtml
->getElementsByTagName("p");
+ $sublinks = [];
+
+ $mores =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "result-link-more",
+ "div"
+ );
+
+ foreach($mores as $more){
+
+ $this->fuckhtml->load($more);
+
+ $as =
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "more",
+ "a"
+ );
+
+ if(count($as) === 0){
+
+ // ?? invalid
+ continue;
+ }
+
+ $sublinks[] = [
+ "title" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "more-title",
+ "span"
+ )[0]
+ )
+ ),
+ "description" =>
+ $this->titledots(
+ $this->fuckhtml
+ ->getTextContent(
+ $this->fuckhtml
+ ->getElementsByClassName(
+ "more-extract",
+ "span"
+ )[0]
+ )
+ ),
+ "url" =>
+ $this->fuckhtml
+ ->getTextContent(
+ $as[0]
+ ["attributes"]
+ ["href"]
+ )
+ ];
+ }
+
+ // reset
+ $this->fuckhtml->load($result);
+
$out["web"][] = [
"title" =>
$this->titledots(
@@ -153,7 +221,7 @@ class mwmbl{
"url" => null,
"ratio" => null
],
- "sublink" => [],
+ "sublink" => $sublinks,
"table" => []
];
}
Send patches to the email address below:
yukais@pinapelz.com
Include [PATCH repo_name] in the subject line.
pinapelz.com
homepage