aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author lolcat <will@lolcat.ca> 2025-08-10 21:49:51 -0400
committer lolcat <will@lolcat.ca> 2025-08-10 21:49:51 -0400
commit 2d63475b07781619e6fb818e08edd51e2fdb041e (patch)
tree efb61dc701db30ada976d1de61486e8ae532d482
parent ae31274db9777dc9657fe9334b1c5c613c9566ed (diff)
fix MDN answers not rendering properly
-rw-r--r-- scraper/ddg.php 319
1 files changed, 171 insertions, 148 deletions
diff --git a/scraper/ddg.php b/scraper/ddg.php
index 8784da9..9470415 100644
--- a/scraper/ddg.php
+++ b/scraper/ddg.php
@@ -1046,20 +1046,38 @@ class ddg{
if(isset($json["Abstract"])){
- $description[] =
- [
- "type" => "text",
- "value" => $json["Abstract"]
- ];
+ $description = $this->parse_rich_text($json["Abstract"]);
+ }
+
+ if(
+ !isset($json["Image"]) ||
+ $json["Image"] == "" ||
+ $json["Image"] === null ||
+ $json["Image"] == "https://duckduckgo.com/i/"
+ ){
+
+ $image = null;
+ }else{
+
+ if(
+ preg_match(
+ '/^https?:\/\//',
+ $json["Image"]
+ )
+ ){
+
+ $image = $json["Image"];
+ }else{
+
+ $image = "https://duckduckgo.com" . $json["Image"];
+ }
}
$out["answer"][] = [
"title" => $json["Heading"],
"description" => $description,
"url" => $json["AbstractURL"],
- "thumb" =>
- (!isset($json["Image"]) || $json["Image"] == "" || $json["Image"] === null) ?
- null : "https://duckduckgo.com" . $json["Image"],
+ "thumb" => $image,
"table" => $table,
"sublink" => $sublinks
];
@@ -1382,146 +1400,7 @@ class ddg{
isset($answer["Abstract"])
){
- // got some data
- $description = [];
- $html = &$answer["Abstract"];
-
- // pre-process the html, remove useless elements
- $html =
- strip_tags(
- $html,
- [
- "h1", "h2", "h3", "h4", "h5", "h6", "h7",
- "pre", "code"
- ]
- );
-
- $html =
- preg_replace(
- '/<(\/?)pre *[^>]*>\s*<\/?code *[^>]*>/i',
- '<$1pre>',
- $html
- );
-
- $this->fuckhtml->load($html);
-
- $tags =
- $this->fuckhtml
- ->getElementsByTagName(
- "*"
- );
-
- if(count($tags) === 0){
-
- $description = [
- "type" => "text",
- "value" =>
- trim(
- $this->fuckhtml
- ->getTextContent(
- substr(
- $html,
- $start,
- $tag["startPos"] - $start
- ),
- true,
- false
- )
- )
- ];
- }else{
-
- $start = 0;
- $was_code_block = true;
- foreach($tags as $tag){
-
- $text =
- $this->fuckhtml
- ->getTextContent(
- substr(
- $html,
- $start,
- $tag["startPos"] - $start
- ),
- true,
- false
- );
-
- if($was_code_block){
-
- $text = ltrim($text);
- $was_code_block = false;
- }
-
- $description[] = [
- "type" => "text",
- "value" => $text
- ];
-
- switch($tag["tagName"]){
-
- case "pre":
- $append = "code";
- $was_code_block = true;
- $c = count($description) - 1;
- $description[$c]["value"] =
- rtrim($description[$c]["value"]);
- break;
-
- case "code":
- $append = "inline_code";
- $c = count($description) - 1;
- $description[$c]["value"] =
- rtrim($description[$c]["value"]) . " ";
- break;
-
- case "h1":
- case "h2":
- case "h3":
- case "h4":
- case "h5":
- case "h6":
- case "h7":
- $append = "title";
- $c = count($description) - 1;
- $description[$c]["value"] =
- rtrim($description[$c]["value"]);
- break;
- }
-
- $description[] = [
- "type" => $append,
- "value" =>
- trim(
- $this->fuckhtml
- ->getTextContent(
- $tag,
- true,
- false
- )
- )
- ];
-
- $start = $tag["endPos"];
- }
-
- // shit out remainder
- $description[] = [
- "type" => "text",
- "value" =>
- trim(
- $this->fuckhtml
- ->getTextContent(
- substr(
- $html,
- $start
- ),
- true,
- false
- )
- )
- ];
- }
+ $description = $this->parse_rich_text($answer["Abstract"]);
$out["answer"][] = [
"title" => $title,
@@ -2064,6 +1943,150 @@ class ddg{
return $out;
}
+	/**
+	 * Convert an HTML abstract into a flat list of description nodes.
+	 *
+	 * Every tag except headings (h1-h7), <pre> and <code> is stripped. The
+	 * remaining markup is walked in order and emitted as a list of
+	 * ["type" => ..., "value" => ...] arrays, where type is one of "text",
+	 * "code" (from <pre>), "inline_code" (from <code>) or "title" (from a
+	 * heading).
+	 *
+	 * Side effect: reloads $this->fuckhtml with the pre-processed abstract.
+	 *
+	 * @param string $html Raw abstract markup
+	 * @return array List of description nodes; a tag-free abstract yields a
+	 *               single "text" node
+	 */
+	private function parse_rich_text($html){
+		
+		$description = [];
+		
+		// pre-process the html, remove useless elements
+		$html =
+			strip_tags(
+				$html,
+				[
+					"h1", "h2", "h3", "h4", "h5", "h6", "h7",
+					"pre", "code"
+				]
+			);
+		
+		// a <pre> (or </pre>) directly followed by a <code>/</code> tag is
+		// collapsed into a bare <pre> (or </pre>) so the pair is seen as one
+		// code block
+		$html =
+			preg_replace(
+				'/<(\/?)pre *[^>]*>\s*<\/?code *[^>]*>/i',
+				'<$1pre>',
+				$html
+			);
+		
+		$this->fuckhtml->load($html);
+		
+		$tags =
+			$this->fuckhtml
+			->getElementsByTagName(
+				"*"
+			);
+		
+		if(count($tags) === 0){
+			
+			// no markup survived: the whole abstract is one text node.
+			// This branch used to read $start and $tag before either was
+			// defined, which produced an empty value, and it overwrote
+			// $description with a single node instead of a list of nodes.
+			$description[] = [
+				"type" => "text",
+				"value" =>
+					trim(
+						$this->fuckhtml
+						->getTextContent(
+							$html,
+							true,
+							false
+						)
+					)
+			];
+			
+			return $description;
+		}
+		
+		$start = 0;
+		
+		// starts out true so the very first text chunk gets left-trimmed
+		$was_code_block = true;
+		
+		foreach($tags as $tag){
+			
+			// NOTE(review): assumes tags come back in document order. A tag
+			// that starts before the cursor sits inside an element that was
+			// already emitted; slicing it would hand substr() a negative
+			// length, so it is skipped.
+			if($tag["startPos"] < $start){
+				
+				continue;
+			}
+			
+			// text sitting between the previous tag and this one
+			$text =
+				$this->fuckhtml
+				->getTextContent(
+					substr(
+						$html,
+						$start,
+						$tag["startPos"] - $start
+					),
+					true,
+					false
+				);
+			
+			if($was_code_block){
+				
+				$text = ltrim($text);
+				$was_code_block = false;
+			}
+			
+			switch($tag["tagName"]){
+				
+				case "pre":
+					$append = "code";
+					$was_code_block = true;
+					$text = rtrim($text);
+					break;
+				
+				case "code":
+					$append = "inline_code";
+					// keep exactly one space between prose and inline code
+					$text = rtrim($text) . " ";
+					break;
+				
+				case "h1":
+				case "h2":
+				case "h3":
+				case "h4":
+				case "h5":
+				case "h6":
+				case "h7":
+					$append = "title";
+					$text = rtrim($text);
+					break;
+				
+				default:
+					// should not happen given the strip_tags() allowlist,
+					// but never leave $append undefined or stale
+					$append = "text";
+					break;
+			}
+			
+			$description[] = [
+				"type" => "text",
+				"value" => $text
+			];
+			
+			$description[] = [
+				"type" => $append,
+				"value" =>
+					trim(
+						$this->fuckhtml
+						->getTextContent(
+							$tag,
+							true,
+							false
+						)
+					)
+			];
+			
+			$start = $tag["endPos"];
+		}
+		
+		// emit whatever text follows the last tag
+		$description[] = [
+			"type" => "text",
+			"value" =>
+				trim(
+					$this->fuckhtml
+					->getTextContent(
+						substr(
+							$html,
+							$start
+						),
+						true,
+						false
+					)
+				)
+		];
+		
+		return $description;
+	}
+
+
private function titledots($title){
$substr = substr($title, -3);
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage