author    frosty <frosty@illegalfirearms.store>   2026-01-06 23:46:24 -0500
committer frosty <frosty@illegalfirearms.store>   2026-01-06 23:46:24 -0500
commit    f3aa7ca0bc2ef7c286609e8f87d07cc2568093af (patch)
tree      269352af1238b4dd7c3e2e023f71a27b858cdb34 /src

rebase(d)
Diffstat (limited to 'src')
-rw-r--r--  src/Infobox/Calculator.c   |  115
-rw-r--r--  src/Infobox/Calculator.h   |    9
-rw-r--r--  src/Infobox/Infobox.c      |   13
-rw-r--r--  src/Infobox/Infobox.h      |   13
-rw-r--r--  src/Infobox/Wikipedia.c    |  165
-rw-r--r--  src/Infobox/Wikipedia.h    |    9
-rw-r--r--  src/Config.h               |    2
-rw-r--r--  src/Main.c                 |   36
-rw-r--r--  src/Routes/Home.c          |   14
-rw-r--r--  src/Routes/Home.h          |    8
-rw-r--r--  src/Routes/Images.c        |  277
-rw-r--r--  src/Routes/Images.h        |    8
-rw-r--r--  src/Routes/Search.c        |  274
-rw-r--r--  src/Routes/Search.h        |    8
-rw-r--r--  src/Scraping/Scraping.c    |  468
-rw-r--r--  src/Scraping/Scraping.h    |   35
-rw-r--r--  src/Utility/Display.c      |   46
-rw-r--r--  src/Utility/Display.h      |    6
-rw-r--r--  src/Utility/Unescape.c     |   80
-rw-r--r--  src/Utility/Unescape.h     |   10
-rw-r--r--  src/Utility/Utility.c      |    8
-rw-r--r--  src/Utility/Utility.h      |    6
22 files changed, 1610 insertions(+), 0 deletions(-)
diff --git a/src/Config.h b/src/Config.h
new file mode 100644
index 0000000..b5695b7
--- /dev/null
+++ b/src/Config.h
@@ -0,0 +1,2 @@
+static int port = 5000;
+static char host[] = "0.0.0.0";
\ No newline at end of file
diff --git a/src/Infobox/Calculator.c b/src/Infobox/Calculator.c
new file mode 100644
index 0000000..b80ce21
--- /dev/null
+++ b/src/Infobox/Calculator.c
@@ -0,0 +1,115 @@
+#include "Calculator.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+static char logic_log[4096];
+
+typedef struct {
+ const char *buffer;
+ int pos;
+} Parser;
+
+static double parse_expression(Parser *p);
+
+static void skip_ws(Parser *p) {
+ while (p->buffer[p->pos] == ' ') p->pos++;
+}
+
+static double parse_factor(Parser *p) {
+ skip_ws(p);
+ if (p->buffer[p->pos] == '-') {
+ p->pos++;
+ return -parse_factor(p);
+ }
+ if (p->buffer[p->pos] == '(') {
+ p->pos++;
+ double res = parse_expression(p);
+ if (p->buffer[p->pos] == ')') p->pos++;
+ return res;
+ }
+ char *endptr;
+ double val = strtod(&p->buffer[p->pos], &endptr);
+ p->pos = (int)(endptr - p->buffer);
+ return val;
+}
+
+static double parse_term(Parser *p) {
+ double left = parse_factor(p);
+ while (1) {
+ skip_ws(p);
+ char op = p->buffer[p->pos];
+ if (op == '*' || op == '/') {
+ p->pos++;
+ double right = parse_factor(p);
+ double old = left;
+ left = (op == '*') ? left * right : left / right;
+
+ char step[256];
+
+ snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
+ right, left);
+ strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
+ } else
+ break;
+ }
+ return left;
+}
+
+static double parse_expression(Parser *p) {
+ double left = parse_term(p);
+ while (1) {
+ skip_ws(p);
+ char op = p->buffer[p->pos];
+ if (op == '+' || op == '-') {
+ p->pos++;
+ double right = parse_term(p);
+ double old = left;
+ left = (op == '+') ? left + right : left - right;
+
+ char step[256];
+
+ snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
+ right, left);
+ strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
+ } else
+ break;
+ }
+ return left;
+}
+
+double evaluate(const char *expr) {
+ logic_log[0] = '\0';
+ if (!expr || strlen(expr) == 0) return 0.0;
+ Parser p = {expr, 0};
+ return parse_expression(&p);
+}
+
+InfoBox fetch_calc_data(char *math_input) {
+ InfoBox info = {NULL, NULL, NULL, NULL};
+ if (!math_input) return info;
+
+ double result = evaluate(math_input);
+
+ char html_output[5120];
+ snprintf(html_output, sizeof(html_output),
+ "<div class='calc-container' style='line-height: 1.6;'>"
+ "%s"
+ "<div style='margin-top: 8px; border-top: 1px solid #eee; "
+ "padding-top: 8px; font-size: 1.2em;'>"
+ "<b>%g</b>"
+ "</div>"
+ "</div>",
+ strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>",
+ result);
+
+ info.title = strdup("Calculation");
+ info.extract = strdup(html_output);
+ info.thumbnail_url =
+ strdup("/static/calculation.svg");
+ info.url = strdup("#");
+
+ return info;
+}
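
Calculator.c is a textbook recursive-descent evaluator (expression to term to
factor), so '*' and '/' bind tighter than '+' and '-', while parentheses and
unary minus recurse through parse_factor(). A minimal caller sketch, not part
of this commit, assuming it links against Calculator.c and Infobox.c:

    #include <stdio.h>
    #include "Infobox/Calculator.h"

    int main(void) {
      printf("%g\n", evaluate("2 + 3 * 4"));    /* 14: '*' binds tighter */
      printf("%g\n", evaluate("-(1 + 2) / 4")); /* -0.75: parens, unary minus */

      /* fetch_calc_data() also records each reduction step as HTML. */
      InfoBox box = fetch_calc_data("6 / 2 + 1");
      if (box.extract) printf("%s\n", box.extract);
      free_infobox(&box);
      return 0;
    }
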
diff --git a/src/Infobox/Calculator.h b/src/Infobox/Calculator.h
new file mode 100644
index 0000000..275aed6
--- /dev/null
+++ b/src/Infobox/Calculator.h
@@ -0,0 +1,9 @@
+#ifndef CALCULATOR_H
+#define CALCULATOR_H
+
+#include "Infobox.h"
+
+double evaluate(const char *expr);
+InfoBox fetch_calc_data(char *math_input);
+
+#endif
diff --git a/src/Infobox/Infobox.c b/src/Infobox/Infobox.c
new file mode 100644
index 0000000..5043c05
--- /dev/null
+++ b/src/Infobox/Infobox.c
@@ -0,0 +1,13 @@
+#include "Infobox.h"
+#include <stdlib.h>
+
+void free_infobox(InfoBox *info) {
+ if (info->title)
+ free(info->title);
+ if (info->thumbnail_url)
+ free(info->thumbnail_url);
+ if (info->extract)
+ free(info->extract);
+ if (info->url)
+ free(info->url);
+}
diff --git a/src/Infobox/Infobox.h b/src/Infobox/Infobox.h
new file mode 100644
index 0000000..a052b80
--- /dev/null
+++ b/src/Infobox/Infobox.h
@@ -0,0 +1,13 @@
+#ifndef INFOBOX_H
+#define INFOBOX_H
+
+typedef struct {
+ char *title;
+ char *thumbnail_url;
+ char *extract;
+ char *url;
+} InfoBox;
+
+void free_infobox(InfoBox *info);
+
+#endif
diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c
new file mode 100644
index 0000000..ed4645f
--- /dev/null
+++ b/src/Infobox/Wikipedia.c
@@ -0,0 +1,165 @@
+#include "Wikipedia.h"
+#include <curl/curl.h>
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct WikiMemoryStruct {
+ char *memory;
+ size_t size;
+};
+
+static void shorten_summary(char **extract_ptr, int max_chars) {
+ if (!extract_ptr || !*extract_ptr) return;
+
+ char *text = *extract_ptr;
+ int len = strlen(text);
+
+ if (len <= max_chars) return;
+
+ int end_pos = max_chars;
+ for (int i = max_chars; i > (max_chars / 2); i--) {
+ if (text[i] == '.' || text[i] == '!' || text[i] == '?') {
+ end_pos = i + 1;
+ break;
+ }
+ }
+
+ char *new_text = (char *)malloc(end_pos + 4);
+
+ if (new_text) {
+ strncpy(new_text, text, end_pos);
+ new_text[end_pos] = '\0';
+ strcat(new_text, "...");
+ free(*extract_ptr);
+ *extract_ptr = new_text;
+ }
+}
+
+static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb,
+ void *userp) {
+ size_t realsize = size * nmemb;
+ struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp;
+
+ char *ptr = realloc(mem->memory, mem->size + realsize + 1);
+ if (ptr == NULL) {
+ fprintf(stderr, "Not enough memory (realloc returned NULL)\n");
+ return 0;
+ }
+
+ mem->memory = ptr;
+ memcpy(&(mem->memory[mem->size]), contents, realsize);
+ mem->size += realsize;
+ mem->memory[mem->size] = 0;
+
+ return realsize;
+}
+
+static void extract_wiki_info(xmlNode *node, InfoBox *info) {
+ xmlNode *cur_node = NULL;
+
+ for (cur_node = node; cur_node; cur_node = cur_node->next) {
+ if (cur_node->type == XML_ELEMENT_NODE) {
+ if (strcmp((const char *)cur_node->name, "page") == 0) {
+ xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title");
+ if (title) {
+ info->title = strdup((const char *)title);
+
+ const char *base_article_url = "https://en.wikipedia.org/wiki/";
+ char *formatted_title = strdup((const char *)title);
+ for (int i = 0; formatted_title[i]; i++) {
+ if (formatted_title[i] == ' ') formatted_title[i] = '_';
+ }
+
+ info->url =
+ malloc(strlen(base_article_url) + strlen(formatted_title) + 1);
+ if (info->url) {
+ strcpy(info->url, base_article_url);
+ strcat(info->url, formatted_title);
+ }
+ free(formatted_title);
+ xmlFree(title);
+ }
+ }
+
+ if (strcmp((const char *)cur_node->name, "thumbnail") == 0) {
+ xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source");
+ if (source) {
+ info->thumbnail_url = strdup((const char *)source);
+ xmlFree(source);
+ }
+ }
+
+ if (strcmp((const char *)cur_node->name, "extract") == 0) {
+ xmlChar *content = xmlNodeGetContent(cur_node);
+ if (content) {
+ info->extract = strdup((const char *)content);
+
+ shorten_summary(&(info->extract), 300);
+ xmlFree(content);
+ }
+ }
+ }
+ extract_wiki_info(cur_node->children, info);
+ }
+}
+
+InfoBox fetch_wiki_data(char *api_url) {
+ CURL *curl_handle;
+ CURLcode res;
+ struct WikiMemoryStruct chunk;
+ InfoBox info = {NULL, NULL, NULL, NULL};
+
+ chunk.memory = malloc(1);
+ chunk.size = 0;
+
+ curl_handle = curl_easy_init();
+
+ if (curl_handle) {
+ curl_easy_setopt(curl_handle, CURLOPT_URL, api_url);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
+ WikiWriteMemoryCallback);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
+ curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
+
+ res = curl_easy_perform(curl_handle);
+
+ if (res == CURLE_OK) {
+ xmlDocPtr doc =
+ xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
+ if (doc != NULL) {
+ xmlNode *root_element = xmlDocGetRootElement(doc);
+ extract_wiki_info(root_element, &info);
+ xmlFreeDoc(doc);
+ }
+ }
+
+ curl_easy_cleanup(curl_handle);
+ free(chunk.memory);
+ }
+
+ return info;
+}
+
+char *construct_wiki_url(const char *search_term) {
+ CURL *curl = curl_easy_init();
+ if (!curl) return NULL;
+
+ char *escaped_term = curl_easy_escape(curl, search_term, 0);
+ const char *base =
+ "https://en.wikipedia.org/w/"
+ "api.php?action=query&prop=extracts|pageimages&exintro&"
+ "explaintext&pithumbsize=400&format=xml&origin=*&titles=";
+
+ char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1);
+ if (full_url) {
+ strcpy(full_url, base);
+ strcat(full_url, escaped_term);
+ }
+
+ curl_free(escaped_term);
+ curl_easy_cleanup(curl);
+ return full_url;
+}
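
A usage sketch for the pair above (illustrative only; in the server the
caller is the wiki thread in Search.c, and Main.c runs curl_global_init()
before any request is made):

    #include <stdio.h>
    #include <stdlib.h>
    #include "Infobox/Wikipedia.h"

    int main(void) {
      char *url = construct_wiki_url("Ada Lovelace"); /* percent-encodes the term */
      if (!url) return 1;

      InfoBox info = fetch_wiki_data(url);
      if (info.title)
        printf("%s\n%s\n", info.title, info.url ? info.url : "(no url)");

      free_infobox(&info);
      free(url);
      return 0;
    }
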
diff --git a/src/Infobox/Wikipedia.h b/src/Infobox/Wikipedia.h
new file mode 100644
index 0000000..8a8103e
--- /dev/null
+++ b/src/Infobox/Wikipedia.h
@@ -0,0 +1,9 @@
+#ifndef WIKIPEDIA_H
+#define WIKIPEDIA_H
+
+#include "Infobox.h"
+
+InfoBox fetch_wiki_data(char *api_url);
+char *construct_wiki_url(const char *search_term);
+
+#endif
diff --git a/src/Main.c b/src/Main.c
new file mode 100644
index 0000000..ad08f3e
--- /dev/null
+++ b/src/Main.c
@@ -0,0 +1,36 @@
+#include <beaker.h>
+#include <curl/curl.h>
+#include <libxml/parser.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Config.h"
+#include "Routes/Home.h"
+#include "Routes/Images.h"
+#include "Routes/Search.h"
+
+int main() {
+ LIBXML_TEST_VERSION
+ xmlInitParser();
+
+ curl_global_init(CURL_GLOBAL_DEFAULT);
+
+ set_handler("/", home_handler);
+ set_handler("/search", results_handler);
+ set_handler("/images", images_handler);
+
+ fprintf(stderr, "Starting Omnisearch on %s:%d\n", host, port);
+
+ int result = beaker_run(host, port);
+
+ if (result != 0) {
+ fprintf(stderr, "Error: Beaker server failed to start.\n");
+ curl_global_cleanup();
+ xmlCleanupParser();
+ return EXIT_FAILURE;
+ }
+
+ curl_global_cleanup();
+ xmlCleanupParser();
+ return EXIT_SUCCESS;
+}
diff --git a/src/Routes/Home.c b/src/Routes/Home.c
new file mode 100644
index 0000000..81370ba
--- /dev/null
+++ b/src/Routes/Home.c
@@ -0,0 +1,14 @@
+#include "Home.h"
+#include <stdlib.h>
+
+int home_handler(UrlParams *params) {
+ (void)params;
+ TemplateContext ctx = new_context();
+  char *rendered_html = render_template("home.html", &ctx);
+  if (rendered_html) send_response(rendered_html);
+
+ free(rendered_html);
+ free_context(&ctx);
+
+ return 0;
+}
diff --git a/src/Routes/Home.h b/src/Routes/Home.h
new file mode 100644
index 0000000..5d01ab3
--- /dev/null
+++ b/src/Routes/Home.h
@@ -0,0 +1,8 @@
+#ifndef HOME_H
+#define HOME_H
+
+#include <beaker.h>
+
+int home_handler(UrlParams *params);
+
+#endif
diff --git a/src/Routes/Images.c b/src/Routes/Images.c
new file mode 100644
index 0000000..47e3a72
--- /dev/null
+++ b/src/Routes/Images.c
@@ -0,0 +1,277 @@
+#include "Images.h"
+#include "../Utility/Unescape.h"
+
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+static void get_image_timestamp(char *buffer, size_t size) {
+ time_t now = time(NULL);
+ struct tm *t = localtime(&now);
+ if (t) {
+ strftime(buffer, size, "%Y-%m-%d %H:%M:%S", t);
+ }
+}
+
+#define IMG_LOG_INFO(msg, ...) \
+ { \
+ char ts[20]; \
+ get_image_timestamp(ts, sizeof(ts)); \
+ fprintf(stderr, "[%s] INFO [ImagesHandler] " msg "\n", ts, \
+ ##__VA_ARGS__); \
+ }
+
+#define IMG_LOG_ERROR(msg, ...) \
+ { \
+ char ts[20]; \
+ get_image_timestamp(ts, sizeof(ts)); \
+ fprintf(stderr, "[%s] ERROR [ImagesHandler] " msg "\n", ts, \
+ ##__VA_ARGS__); \
+ }
+
+struct MemoryBlock {
+ char *response;
+ size_t size;
+};
+
+static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb,
+ void *userp) {
+ size_t realsize = size * nmemb;
+ struct MemoryBlock *mem = (struct MemoryBlock *)userp;
+ char *ptr = (char *)realloc(mem->response, mem->size + realsize + 1);
+ if (ptr == NULL) {
+ IMG_LOG_ERROR("Realloc failed in WriteCallback (out of memory)");
+ return 0;
+ }
+ mem->response = ptr;
+ memcpy(&(mem->response[mem->size]), data, realsize);
+ mem->size += realsize;
+ mem->response[mem->size] = 0;
+ return realsize;
+}
+
+static char *fetch_images_html(const char *url) {
+ CURL *curl_handle;
+ struct MemoryBlock chunk = {.response = malloc(1), .size = 0};
+ if (!chunk.response) {
+ IMG_LOG_ERROR("Initial malloc failed for fetch_images_html");
+ return NULL;
+ }
+
+ IMG_LOG_INFO("Initializing cURL handle for URL: %s", url);
+ curl_handle = curl_easy_init();
+ if (!curl_handle) {
+ IMG_LOG_ERROR("curl_easy_init() failed");
+ free(chunk.response);
+ return NULL;
+ }
+
+ curl_easy_setopt(curl_handle, CURLOPT_URL, url);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, ImageWriteCallback);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
+ curl_easy_setopt(
+ curl_handle, CURLOPT_USERAGENT,
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+ curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L);
+
+ CURLcode res = curl_easy_perform(curl_handle);
+ if (res != CURLE_OK) {
+ IMG_LOG_ERROR("curl_easy_perform() failed: %s", curl_easy_strerror(res));
+ free(chunk.response);
+ curl_easy_cleanup(curl_handle);
+ return NULL;
+ }
+
+ IMG_LOG_INFO("Successfully fetched %zu bytes from Yahoo Images", chunk.size);
+ curl_easy_cleanup(curl_handle);
+ return chunk.response;
+}
+
+static char *get_json_field_internal(const char *json, const char *key) {
+ if (!json) return NULL;
+ char search_key[64];
+ snprintf(search_key, sizeof(search_key), "\"%s\":\"", key);
+ char *start = strstr(json, search_key);
+ if (!start) return NULL;
+ start += strlen(search_key);
+ char *end = strchr(start, '\"');
+ if (!end) return NULL;
+
+ size_t len = end - start;
+ char *val = (char *)malloc(len + 1);
+ if (!val) return NULL;
+
+ size_t j = 0;
+ for (size_t i = 0; i < len; i++) {
+ if (start[i] == '\\' && i + 1 < len && start[i + 1] == '/') {
+ val[j++] = '/';
+ i++;
+ } else {
+ val[j++] = start[i];
+ }
+ }
+ val[j] = '\0';
+ return val;
+}
+
+int images_handler(UrlParams *params) {
+ IMG_LOG_INFO("Start images_handler request processing");
+ TemplateContext ctx = new_context();
+ char *raw_query = "";
+
+ if (params) {
+ for (int i = 0; i < params->count; i++) {
+ if (strcmp(params->params[i].key, "q") == 0) {
+ raw_query = params->params[i].value;
+ break;
+ }
+ }
+ }
+
+ char *encoded_query = strdup(raw_query);
+
+ char *display_query = url_decode_query(raw_query);
+ context_set(&ctx, "query", display_query);
+
+ if (!encoded_query || strlen(encoded_query) == 0) {
+ IMG_LOG_INFO("Empty search query received, returning early warning");
+ send_response("<h1>No query provided</h1>");
+ if (encoded_query) free(encoded_query);
+ if (display_query) free(display_query);
+ free_context(&ctx);
+ return -1;
+ }
+
+ char url[1024];
+ snprintf(url, sizeof(url),
+ "https://images.search.yahoo.com/search/images?p=%s", encoded_query);
+
+ IMG_LOG_INFO("Requesting external HTML from Yahoo Images...");
+ char *html = fetch_images_html(url);
+ if (!html) {
+ IMG_LOG_ERROR("Failed to fetch image search results from Yahoo");
+ send_response("<h1>Error fetching images</h1>");
+ free(encoded_query);
+ free(display_query);
+ free_context(&ctx);
+ return -1;
+ }
+
+ IMG_LOG_INFO("Parsing HTML with libxml2...");
+ htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
+ if (!doc) {
+ IMG_LOG_ERROR("htmlReadMemory failed to create document pointer");
+ free(html);
+ free(encoded_query);
+ free(display_query);
+ free_context(&ctx);
+ return -1;
+ }
+
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+
+ if (!xpathCtx) {
+ IMG_LOG_ERROR("xmlXPathNewContext failed");
+ xmlFreeDoc(doc);
+ free(html);
+ free(encoded_query);
+ free(display_query);
+ free_context(&ctx);
+ return -1;
+ }
+
+ IMG_LOG_INFO("Executing XPath expression: //li[@data]");
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((const xmlChar *)"//li[@data]", xpathCtx);
+
+ int image_count = 0;
+ char ***image_matrix = NULL;
+ int *inner_counts = NULL;
+
+ if (xpathObj && xpathObj->nodesetval) {
+ int nodes = xpathObj->nodesetval->nodeNr;
+ IMG_LOG_INFO("XPath found %d potential image nodes", nodes);
+
+ int max_images = (nodes < 32) ? nodes : 32;
+ image_matrix = malloc(sizeof(char **) * max_images);
+ inner_counts = malloc(sizeof(int) * max_images);
+
+ for (int i = 0; i < nodes; i++) {
+ if (image_count >= 32) break;
+
+ xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
+ xmlChar *data_attr = xmlGetProp(node, (const xmlChar *)"data");
+ if (data_attr) {
+ char *iurl = get_json_field_internal((char *)data_attr, "iurl");
+ char *title = get_json_field_internal((char *)data_attr, "alt");
+ char *rurl = get_json_field_internal((char *)data_attr, "rurl");
+
+ if (iurl && strlen(iurl) > 0) {
+ image_matrix[image_count] = malloc(sizeof(char *) * 3);
+ image_matrix[image_count][0] = strdup(iurl);
+ image_matrix[image_count][1] = strdup(title ? title : "Image");
+ image_matrix[image_count][2] = strdup(rurl ? rurl : "#");
+ inner_counts[image_count] = 3;
+ image_count++;
+ }
+
+ if (iurl) free(iurl);
+ if (title) free(title);
+ if (rurl) free(rurl);
+ xmlFree(data_attr);
+ }
+ }
+ IMG_LOG_INFO("Successfully parsed %d valid image results (capped at 32)",
+ image_count);
+ } else {
+ IMG_LOG_INFO("No image nodes found in the HTML document");
+ }
+
+ IMG_LOG_INFO("Setting image array in template context...");
+ context_set_array_of_arrays(&ctx, "images", image_matrix, image_count,
+ inner_counts);
+
+ IMG_LOG_INFO("Rendering images.html template...");
+ char *rendered = render_template("images.html", &ctx);
+ if (rendered) {
+ IMG_LOG_INFO("Sending rendered template to client (%zu bytes)",
+ strlen(rendered));
+ send_response(rendered);
+ free(rendered);
+ } else {
+ IMG_LOG_ERROR("render_template returned NULL for images.html");
+ send_response("<h1>Error rendering image results</h1>");
+ }
+
+ IMG_LOG_INFO("Beginning memory cleanup...");
+
+ if (image_matrix) {
+ for (int i = 0; i < image_count; i++) {
+ for (int j = 0; j < 3; j++) {
+ free(image_matrix[i][j]);
+ }
+ free(image_matrix[i]);
+ }
+ free(image_matrix);
+ }
+ if (inner_counts) {
+ free(inner_counts);
+ }
+
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ if (xpathCtx) xmlXPathFreeContext(xpathCtx);
+ if (doc) xmlFreeDoc(doc);
+ free(html);
+ free(encoded_query);
+ free(display_query);
+ free_context(&ctx);
+
+ IMG_LOG_INFO("Images request cycle complete");
+ return 0;
+}
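
Yahoo serializes each result's metadata as a JSON blob in the li element's
data attribute, and get_json_field_internal() performs keyhole extraction on
it rather than full JSON parsing. An illustrative attribute value (hand
written, not captured from a live page) and what the helper recovers:

    /* Field names match what the handler queries above. */
    const char *data_attr =
        "{\"iurl\":\"https:\\/\\/example.com\\/full.jpg\","
        "\"alt\":\"Example image\","
        "\"rurl\":\"https:\\/\\/example.com\\/page.html\"}";

    /* get_json_field_internal(data_attr, "iurl") yields
     * "https://example.com/full.jpg": escaped slashes are unescaped, but
     * any other JSON escape (\", \n) passes through verbatim, and a value
     * containing an escaped quote would be cut short at it. */
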
diff --git a/src/Routes/Images.h b/src/Routes/Images.h
new file mode 100644
index 0000000..86f4a31
--- /dev/null
+++ b/src/Routes/Images.h
@@ -0,0 +1,8 @@
+#ifndef IMAGES_HANDLER_H
+#define IMAGES_HANDLER_H
+
+#include <beaker.h>
+
+int images_handler(UrlParams *params);
+
+#endif
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
new file mode 100644
index 0000000..110e6f7
--- /dev/null
+++ b/src/Routes/Search.c
@@ -0,0 +1,274 @@
+#include "Search.h"
+#include "../Infobox/Wikipedia.h"
+#include "../Infobox/Calculator.h"
+#include "../Scraping/Scraping.h"
+#include "../Utility/Display.h"
+#include "../Utility/Unescape.h"
+#include <ctype.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+typedef struct {
+ const SearchEngine *engine;
+ const char *query;
+ SearchResult *results;
+ int count;
+} EngineThreadData;
+
+static void *scrape_thread_func(void *arg) {
+ EngineThreadData *data = (EngineThreadData *)arg;
+ data->count = scrape_engine(data->engine, data->query, &data->results, 10);
+ return NULL;
+}
+
+typedef struct {
+ const char *query;
+ InfoBox result;
+ int success;
+} InfoBoxThreadData;
+
+static void *wiki_thread_func(void *arg) {
+ InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
+ char *dynamic_url = construct_wiki_url(data->query);
+ if (dynamic_url) {
+ data->result = fetch_wiki_data(dynamic_url);
+ data->success =
+ (data->result.title != NULL && data->result.extract != NULL &&
+ strlen(data->result.extract) > 10);
+ free(dynamic_url);
+ } else {
+ data->success = 0;
+ }
+ return NULL;
+}
+
+static int is_calculator_query(const char *query) {
+ if (!query) return 0;
+
+ int has_digit = 0;
+ int has_operator = 0;
+
+ for (const char *p = query; *p; p++) {
+    if (isdigit((unsigned char)*p) || *p == '.') {
+ has_digit = 1;
+ }
+ if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '=' ||
+ *p == '^') {
+ has_operator = 1;
+ }
+ }
+
+ return has_digit && (has_operator || strchr(query, '.'));
+}
+
+static void *calc_thread_func(void *arg) {
+ InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
+
+ if (is_calculator_query(data->query)) {
+ data->result = fetch_calc_data((char *)data->query);
+ data->success =
+ (data->result.title != NULL && data->result.extract != NULL);
+ } else {
+ data->success = 0;
+ }
+
+ return NULL;
+}
+
+static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
+ int **inner_counts, int current_count) {
+ *collection =
+ (char ***)realloc(*collection, sizeof(char **) * (current_count + 1));
+ *inner_counts =
+ (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1));
+
+ (*collection)[current_count] = (char **)malloc(sizeof(char *) * 4);
+ (*collection)[current_count][0] = infobox->title;
+ (*collection)[current_count][1] = infobox->thumbnail_url;
+ (*collection)[current_count][2] = infobox->extract;
+ (*collection)[current_count][3] = infobox->url;
+ (*inner_counts)[current_count] = 4;
+
+ return current_count + 1;
+}
+
+int results_handler(UrlParams *params) {
+ TemplateContext ctx = new_context();
+ char *raw_query = "";
+
+ if (params) {
+ for (int i = 0; i < params->count; i++) {
+ if (strcmp(params->params[i].key, "q") == 0) {
+ raw_query = params->params[i].value;
+ break;
+ }
+ }
+ }
+
+ char *encoded_query = strdup(raw_query);
+
+ char *display_query = url_decode_query(raw_query);
+ LOG_INFO("Processing search request for query: '%s'", display_query);
+ context_set(&ctx, "query", display_query);
+
+ if (!encoded_query || strlen(encoded_query) == 0) {
+ LOG_ERROR("Empty search query provided.");
+ send_response("<h1>No query provided</h1>");
+ if (encoded_query) free(encoded_query);
+ if (display_query) free(display_query);
+ free_context(&ctx);
+ return -1;
+ }
+
+ pthread_t wiki_tid, calc_tid;
+ InfoBoxThreadData wiki_data = {.query = display_query, .success = 0};
+ InfoBoxThreadData calc_data = {.query = display_query, .success = 0};
+
+ pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
+ pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
+
+ pthread_t engine_tids[ENGINE_COUNT];
+ EngineThreadData engine_data[ENGINE_COUNT];
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ engine_data[i].engine = &ENGINE_REGISTRY[i];
+ engine_data[i].query = encoded_query;
+
+ engine_data[i].results = NULL;
+ engine_data[i].count = 0;
+ pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]);
+ }
+
+ pthread_join(wiki_tid, NULL);
+ pthread_join(calc_tid, NULL);
+
+ char ***infobox_matrix = NULL;
+ int *infobox_inner_counts = NULL;
+ int infobox_count = 0;
+
+ if (calc_data.success) {
+ infobox_count =
+ add_infobox_to_collection(&calc_data.result, &infobox_matrix,
+ &infobox_inner_counts, infobox_count);
+ }
+
+ if (wiki_data.success) {
+ infobox_count =
+ add_infobox_to_collection(&wiki_data.result, &infobox_matrix,
+ &infobox_inner_counts, infobox_count);
+ }
+
+ if (infobox_count > 0) {
+ context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix,
+ infobox_count, infobox_inner_counts);
+    for (int i = 0; i < infobox_count; i++) free(infobox_matrix[i]);
+    free(infobox_matrix);
+ free(infobox_inner_counts);
+ } else {
+ context_set_array_of_arrays(&ctx, "infoboxes", NULL, 0, NULL);
+ }
+
+ int total_results = 0;
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ pthread_join(engine_tids[i], NULL);
+ total_results += engine_data[i].count;
+ }
+
+ if (total_results > 0) {
+ char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results);
+ int *results_inner_counts = (int *)malloc(sizeof(int) * total_results);
+ char **seen_urls = (char **)malloc(sizeof(char *) * total_results);
+ int unique_count = 0;
+
+ for (int i = 0; i < ENGINE_COUNT; i++) {
+ for (int j = 0; j < engine_data[i].count; j++) {
+ char *raw_url = engine_data[i].results[j].url;
+ char *clean_url = unescape_search_url(raw_url);
+ char *display_url = clean_url ? clean_url : raw_url;
+
+ int is_duplicate = 0;
+ for (int k = 0; k < unique_count; k++) {
+ if (strcmp(seen_urls[k], display_url) == 0) {
+ is_duplicate = 1;
+ break;
+ }
+ }
+
+ if (is_duplicate) {
+ if (clean_url) free(clean_url);
+ free(engine_data[i].results[j].url);
+ free(engine_data[i].results[j].title);
+ free(engine_data[i].results[j].snippet);
+ continue;
+ }
+
+ seen_urls[unique_count] = strdup(display_url);
+ results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4);
+ char *pretty_url = pretty_display_url(display_url);
+
+ results_matrix[unique_count][0] = strdup(display_url);
+ results_matrix[unique_count][1] = strdup(pretty_url);
+ results_matrix[unique_count][2] =
+ engine_data[i].results[j].title
+ ? strdup(engine_data[i].results[j].title)
+ : strdup("Untitled");
+ results_matrix[unique_count][3] =
+ engine_data[i].results[j].snippet
+ ? strdup(engine_data[i].results[j].snippet)
+ : strdup("");
+
+ results_inner_counts[unique_count] = 4;
+
+ free(pretty_url);
+ free(engine_data[i].results[j].url);
+ free(engine_data[i].results[j].title);
+ free(engine_data[i].results[j].snippet);
+ if (clean_url) free(clean_url);
+
+ unique_count++;
+ }
+ free(engine_data[i].results);
+ }
+
+ context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count,
+ results_inner_counts);
+
+ char *html = render_template("results.html", &ctx);
+ if (html) {
+ send_response(html);
+ free(html);
+ }
+
+ for (int i = 0; i < unique_count; i++) {
+ for (int j = 0; j < 4; j++) free(results_matrix[i][j]);
+ free(results_matrix[i]);
+ free(seen_urls[i]);
+ }
+ free(seen_urls);
+ free(results_matrix);
+ free(results_inner_counts);
+ } else {
+ char *html = render_template("results.html", &ctx);
+ if (html) {
+ send_response(html);
+ free(html);
+ }
+ }
+
+ if (wiki_data.success) {
+ free_infobox(&wiki_data.result);
+ }
+
+ if (calc_data.success) {
+ free_infobox(&calc_data.result);
+ }
+
+ free(encoded_query);
+ free(display_query);
+ free_context(&ctx);
+
+ return 0;
+}
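
The calculator gate is deliberately loose: any digit alongside an operator
character (or a bare decimal point) also routes the query through
fetch_calc_data(). A standalone replica of the static heuristic, to show
what it accepts:

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Mirrors is_calculator_query() in Search.c (replica for illustration). */
    static int looks_like_math(const char *q) {
      int has_digit = 0, has_op = 0;
      for (const char *p = q; *p; p++) {
        if (isdigit((unsigned char)*p) || *p == '.') has_digit = 1;
        if (strchr("+-*/=^", *p)) has_op = 1;
      }
      return has_digit && (has_op || strchr(q, '.') != NULL);
    }

    int main(void) {
      printf("%d\n", looks_like_math("2 + 3 * 4"));    /* 1 */
      printf("%d\n", looks_like_math("3.14"));         /* 1 */
      printf("%d\n", looks_like_math("linux kernel")); /* 0 */
      printf("%d\n", looks_like_math("top-10 lists")); /* 1: '-' reads as minus */
      return 0;
    }
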
diff --git a/src/Routes/Search.h b/src/Routes/Search.h
new file mode 100644
index 0000000..c6bc146
--- /dev/null
+++ b/src/Routes/Search.h
@@ -0,0 +1,8 @@
+#ifndef SEARCH_HANDLER_H
+#define SEARCH_HANDLER_H
+
+#include <beaker.h>
+
+int results_handler(UrlParams *params);
+
+#endif
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
new file mode 100644
index 0000000..d2afea6
--- /dev/null
+++ b/src/Scraping/Scraping.c
@@ -0,0 +1,468 @@
+#include "Scraping.h"
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+typedef struct {
+ char *memory;
+ size_t size;
+} MemoryBuffer;
+
+static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
+ void *userp) {
+ size_t realsize = size * nmemb;
+ MemoryBuffer *mem = (MemoryBuffer *)userp;
+
+ char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
+ if (ptr == NULL) {
+ LOG_ERROR("Not enough memory (realloc returned NULL)");
+ return 0;
+ }
+
+ mem->memory = ptr;
+ memcpy(&(mem->memory[mem->size]), contents, realsize);
+ mem->size += realsize;
+ mem->memory[mem->size] = 0;
+
+ return realsize;
+}
+
+static const char *get_random_user_agent() {
+ static const char *agents[] = {
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+ "like Gecko) Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
+ "Gecko) "
+ "Chrome/120.0.0.0 Safari/537.36",
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
+ "Firefox/121.0",
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
+ "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
+  return agents[rand() % (sizeof(agents) / sizeof(agents[0]))];
+}
+
+static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+ int found_count = 0;
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ if (!xpathCtx) {
+ LOG_ERROR("[%s] Could not create XPath context", engine_name);
+ return 0;
+ }
+
+ const char *link_xpath = "//a[@class='result-link']";
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx);
+
+ if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+ LOG_WARN("[%s] No results found with XPath: %s", engine_name, link_xpath);
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ int num_links = xpathObj->nodesetval->nodeNr;
+ LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
+ num_links);
+
+ int actual_alloc = (num_links < max_results) ? num_links : max_results;
+ *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+ if (!*out_results) {
+ LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ for (int i = 0; i < num_links && found_count < max_results; i++) {
+ xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
+ char *title = (char *)xmlNodeGetContent(linkNode);
+ char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
+ char *snippet_text = NULL;
+
+ xmlNodePtr current = linkNode->parent;
+ while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
+ current = current->parent;
+
+ if (current && current->next) {
+ xmlNodePtr snippetRow = current->next;
+ while (snippetRow &&
+ xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
+ snippetRow = snippetRow->next;
+ if (snippetRow) {
+ xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
+ if (subCtx) {
+ subCtx->node = snippetRow;
+ xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+ (xmlChar *)".//td[@class='result-snippet']", subCtx);
+ if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+ snippet_text =
+ (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
+ }
+ if (sObj) xmlXPathFreeObject(sObj);
+ xmlXPathFreeContext(subCtx);
+ }
+ }
+ }
+
+ (*out_results)[found_count].url = strdup(url ? url : "");
+ (*out_results)[found_count].title = strdup(title ? title : "No Title");
+ (*out_results)[found_count].snippet =
+ strdup(snippet_text ? snippet_text : "");
+
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+ (*out_results)[found_count].title);
+ found_count++;
+
+ if (title) xmlFree(title);
+ if (url) xmlFree(url);
+ if (snippet_text) xmlFree(snippet_text);
+ }
+
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return found_count;
+}
+
+static int parse_startpage(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+ int found_count = 0;
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ if (!xpathCtx) {
+ LOG_ERROR("[%s] Could not create XPath context", engine_name);
+ return 0;
+ }
+
+ const char *container_xpath = "//div[contains(@class, 'result')]";
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+ if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+ LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+ container_xpath);
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ int num_results = xpathObj->nodesetval->nodeNr;
+ LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+ int actual_alloc = (num_results < max_results) ? num_results : max_results;
+ *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+ if (!*out_results) {
+ LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+ xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+ if (!resCtx) {
+ LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
+ i);
+ continue;
+ }
+ resCtx->node = resultNode;
+
+ xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+ (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+ char *url =
+ (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+ (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+ char *title =
+ (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+ (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+ char *snippet_text =
+ (snippetObj && snippetObj->nodesetval &&
+ snippetObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ if (url && title) {
+ (*out_results)[found_count].url = strdup(url);
+ (*out_results)[found_count].title = strdup(title);
+ (*out_results)[found_count].snippet =
+ strdup(snippet_text ? snippet_text : "");
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+ title);
+ found_count++;
+ } else {
+ LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+ engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+ }
+
+ if (title) xmlFree(title);
+ if (url) xmlFree(url);
+ if (snippet_text) xmlFree(snippet_text);
+ if (linkObj) xmlXPathFreeObject(linkObj);
+ if (titleObj) xmlXPathFreeObject(titleObj);
+ if (snippetObj) xmlXPathFreeObject(snippetObj);
+ xmlXPathFreeContext(resCtx);
+ }
+
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return found_count;
+}
+
+static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results) {
+ LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+ int found_count = 0;
+ xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+ if (!xpathCtx) {
+ LOG_ERROR("[%s] Could not create XPath context", engine_name);
+ return 0;
+ }
+
+ const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
+ xmlXPathObjectPtr xpathObj =
+ xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+ if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+ LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+ container_xpath);
+ if (xpathObj) xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ int num_results = xpathObj->nodesetval->nodeNr;
+ LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+ int actual_alloc = (num_results < max_results) ? num_results : max_results;
+ *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+ if (!*out_results) {
+ LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return 0;
+ }
+
+ for (int i = 0; i < num_results && found_count < max_results; i++) {
+ xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+ xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+ if (!resCtx) {
+ LOG_ERROR("[%s] Failed to create result context for item %d", engine_name,
+ i);
+ continue;
+ }
+ resCtx->node = resultNode;
+
+ xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+ (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
+ resCtx);
+ char *url =
+ (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+ (xmlChar *)"href")
+ : NULL;
+
+ xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+ (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+ char *title =
+ (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+ (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+ char *snippet_text =
+ (snippetObj && snippetObj->nodesetval &&
+ snippetObj->nodesetval->nodeNr > 0)
+ ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+ : NULL;
+
+ if (!url || !title) {
+ LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
+ url ? url : "(null)", title ? title : "(null)");
+ }
+
+ if (url && title) {
+ (*out_results)[found_count].url = strdup(url);
+ (*out_results)[found_count].title = strdup(title);
+ (*out_results)[found_count].snippet =
+ strdup(snippet_text ? snippet_text : "");
+ LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+ title);
+ found_count++;
+ } else {
+ LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+ engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+ }
+
+ if (title) xmlFree(title);
+ if (url) xmlFree(url);
+ if (snippet_text) xmlFree(snippet_text);
+ if (linkObj) xmlXPathFreeObject(linkObj);
+ if (titleObj) xmlXPathFreeObject(titleObj);
+ if (snippetObj) xmlXPathFreeObject(snippetObj);
+ xmlXPathFreeContext(resCtx);
+ }
+
+ xmlXPathFreeObject(xpathObj);
+ xmlXPathFreeContext(xpathCtx);
+ return found_count;
+}
+
+const SearchEngine ENGINE_REGISTRY[] = {
+ {.name = "DuckDuckGo Lite",
+ .base_url = "https://lite.duckduckgo.com/lite/?q=",
+ .host_header = "lite.duckduckgo.com",
+ .referer = "https://lite.duckduckgo.com/",
+ .parser = parse_ddg_lite},
+ {.name = "Startpage",
+ .base_url = "https://www.startpage.com/sp/search?q=",
+ .host_header = "www.startpage.com",
+ .referer = "https://www.startpage.com/",
+ .parser = parse_startpage},
+ {.name = "Yahoo",
+ .base_url = "https://search.yahoo.com/search?p=",
+ .host_header = "search.yahoo.com",
+ .referer = "https://search.yahoo.com/",
+ .parser = parse_yahoo}};
+
+const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+ SearchResult **out_results, int max_results) {
+ CURL *curl;
+ MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
+ int results_count = 0;
+
+ LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
+ LOG_INFO("[%s] Query: '%s'", engine->name, query);
+
+ if (!chunk.memory) {
+ LOG_ERROR("Initial memory allocation failed");
+ return -1;
+ }
+
+ curl = curl_easy_init();
+
+ if (curl && query) {
+ char full_url[1024];
+ char *encoded_query = curl_easy_escape(curl, query, 0);
+ if (!encoded_query) {
+ LOG_ERROR("[%s] Failed to encode query", engine->name);
+ curl_easy_cleanup(curl);
+ free(chunk.memory);
+ return -1;
+ }
+ snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
+ encoded_query);
+ curl_free(encoded_query);
+
+ LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);
+
+ struct curl_slist *headers = NULL;
+ char host_buf[256], ref_buf[256];
+ snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
+ snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);
+
+ headers = curl_slist_append(headers, host_buf);
+ headers = curl_slist_append(headers, ref_buf);
+ headers = curl_slist_append(headers,
+ "Accept: "
+ "text/html,application/xhtml+xml,application/"
+ "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+ headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
+ headers = curl_slist_append(headers, "DNT: 1");
+ headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
+ headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
+ headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
+ headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
+ headers = curl_slist_append(headers, "Connection: keep-alive");
+
+ curl_easy_setopt(curl, CURLOPT_URL, full_url);
+ curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+ curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+ curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+
+ const char *ua = get_random_user_agent();
+ LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
+ curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);
+
+ curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+ curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+
+ curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+ LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
+ usleep(500000 + (rand() % 1000000));
+
+ CURLcode res = curl_easy_perform(curl);
+
+ if (res != CURLE_OK) {
+ LOG_ERROR("[%s] libcurl error: %s", engine->name,
+ curl_easy_strerror(res));
+ } else {
+ long response_code;
+ curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
+ LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);
+
+ if (chunk.size > 0) {
+ xmlDocPtr doc = htmlReadMemory(
+ chunk.memory, chunk.size, NULL, NULL,
+ HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+ if (doc) {
+ results_count =
+ engine->parser(engine->name, doc, out_results, max_results);
+ xmlFreeDoc(doc);
+ }
+ }
+ }
+
+ if (results_count <= 0) {
+ LOG_WARN("[%s] No results found. Generating skeleton fallback.",
+ engine->name);
+ *out_results = (SearchResult *)malloc(sizeof(SearchResult));
+ if (*out_results) {
+ char fallback_msg[512];
+ snprintf(fallback_msg, sizeof(fallback_msg),
+ "Search %s manually for '%s'", engine->name, query);
+
+ (*out_results)[0].title = strdup(fallback_msg);
+ (*out_results)[0].url = strdup(full_url);
+ (*out_results)[0].snippet = strdup(
+ "Automated results were blocked by a Captcha or anti-bot "
+ "challenge. Click the link above to perform the search "
+ "manually in your browser.");
+ results_count = 1;
+ }
+ }
+
+ curl_slist_free_all(headers);
+ curl_easy_cleanup(curl);
+ } else {
+ if (curl) {
+ curl_easy_cleanup(curl);
+ }
+ }
+
+ free(chunk.memory);
+
+ return results_count;
+}
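
A synchronous, single-engine caller sketch (the server fans these out with
one thread per registry entry in Search.c; this is illustration only, and
curl_easy_init() lazily performs the global init that Main.c normally does
up front):

    #include <stdlib.h>
    #include "Scraping/Scraping.h"

    int main(void) {
      SearchResult *results = NULL;
      int n = scrape_engine(&ENGINE_REGISTRY[0], "c http server", &results, 10);

      for (int i = 0; i < n; i++) {
        LOG_INFO("%d. %s <%s>", i + 1, results[i].title, results[i].url);
        free(results[i].title); /* caller owns every field, as Search.c shows */
        free(results[i].url);
        free(results[i].snippet);
      }
      free(results);
      return 0;
    }
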
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
new file mode 100644
index 0000000..7ad4d59
--- /dev/null
+++ b/src/Scraping/Scraping.h
@@ -0,0 +1,35 @@
+#ifndef SCRAPING_H
+#define SCRAPING_H
+
+#include <libxml/HTMLparser.h>
+#include <stdio.h>
+
+#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
+#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
+#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
+#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)
+
+typedef struct {
+ char *url;
+ char *title;
+ char *snippet;
+} SearchResult;
+
+typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
+ SearchResult **out_results, int max_results);
+
+typedef struct {
+ const char *name;
+ const char *base_url;
+ const char *host_header;
+ const char *referer;
+ ParserFunc parser;
+} SearchEngine;
+
+extern const SearchEngine ENGINE_REGISTRY[];
+extern const int ENGINE_COUNT;
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+ SearchResult **out_results, int max_results);
+
+#endif
diff --git a/src/Utility/Display.c b/src/Utility/Display.c
new file mode 100644
index 0000000..492e998
--- /dev/null
+++ b/src/Utility/Display.c
@@ -0,0 +1,46 @@
+#include "Display.h"
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+char *pretty_display_url(const char *input) {
+ if (!input) return NULL;
+
+ const char *start = input;
+
+ const char *protocol_pos = strstr(input, "://");
+ if (protocol_pos) {
+ start = protocol_pos + 3;
+ }
+
+ if (strncasecmp(start, "www.", 4) == 0) {
+ start += 4;
+ }
+
+  char temp[512];
+  strncpy(temp, start, sizeof(temp) - 1);
+  temp[sizeof(temp) - 1] = '\0';
+
+  size_t temp_len = strlen(temp);
+  if (temp_len > 0 && temp[temp_len - 1] == '/') {
+    temp[temp_len - 1] = '\0';
+  }
+
+ char *output = (char *)malloc(strlen(temp) * 3 + 1);
+ if (!output) return NULL;
+
+ size_t j = 0;
+ for (size_t i = 0; temp[i] != '\0'; i++) {
+ if (temp[i] == '/') {
+ output[j++] = ' ';
+ output[j++] = '>';
+ output[j++] = ' ';
+ } else {
+ output[j++] = (char)tolower((unsigned char)temp[i]);
+ }
+ }
+ output[j] = '\0';
+
+ return output;
+}
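
The transformation is purely cosmetic: strip the scheme and a leading
"www.", drop one trailing slash, lowercase everything, and render path
separators as " > ". A quick check of the expected output:

    #include <stdio.h>
    #include <stdlib.h>
    #include "Utility/Display.h"

    int main(void) {
      char *pretty = pretty_display_url("https://www.Example.com/Docs/API/");
      printf("%s\n", pretty); /* example.com > docs > api */
      free(pretty);           /* caller frees, as Search.c does */
      return 0;
    }
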
diff --git a/src/Utility/Display.h b/src/Utility/Display.h
new file mode 100644
index 0000000..bbaf421
--- /dev/null
+++ b/src/Utility/Display.h
@@ -0,0 +1,6 @@
+#ifndef DISPLAY_H
+#define DISPLAY_H
+
+char *pretty_display_url(const char *input);
+
+#endif
diff --git a/src/Utility/Unescape.c b/src/Utility/Unescape.c
new file mode 100644
index 0000000..e2968b2
--- /dev/null
+++ b/src/Utility/Unescape.c
@@ -0,0 +1,80 @@
+#include "Unescape.h"
+#include "Utility.h"
+#include <stdlib.h>
+#include <string.h>
+
+char *unescape_search_url(const char *input) {
+ if (!input) return NULL;
+
+ const char *key = NULL;
+ const char *start = NULL;
+ const char *end = NULL;
+ size_t len = 0;
+
+ if (strstr(input, "uddg=")) {
+ key = "uddg=";
+ start = strstr(input, key);
+ if (!start) return NULL;
+ start += strlen(key);
+ end = strchr(start, '&');
+ len = end ? (size_t)(end - start) : strlen(start);
+ }
+
+ else if (strstr(input, "RU=")) {
+ key = "RU=";
+ start = strstr(input, key);
+ if (!start) return strdup(input);
+ start += strlen(key);
+ end = strchr(start, '/');
+ len = end ? (size_t)(end - start) : strlen(start);
+ }
+
+ else {
+ return strdup(input);
+ }
+
+ char *output = (char *)malloc(len * 3 + 1);
+ if (!output) return NULL;
+
+ size_t i = 0, j = 0;
+ while (i < len) {
+ if (start[i] == '%' && i + 2 < len) {
+ int high = hex_to_int(start[i + 1]);
+ int low = hex_to_int(start[i + 2]);
+ if (high != -1 && low != -1) {
+ output[j++] = (char)((high << 4) | low);
+ i += 3;
+ } else {
+ output[j++] = start[i++];
+ }
+ } else if (start[i] == '+') {
+ output[j++] = ' ';
+ i++;
+ } else {
+ output[j++] = start[i++];
+ }
+ }
+ output[j] = '\0';
+
+ return output;
+}
+
+char *url_decode_query(const char *src) {
+ if (!src) return NULL;
+ char *res = strdup(src);
+ char *p = res;
+ while (*src) {
+ if (*src == '+') {
+ *p++ = ' ';
+ } else if (*src == '%' && src[1] && src[2]) {
+ char hex[3] = {src[1], src[2], '\0'};
+ *p++ = (char)strtol(hex, NULL, 16);
+ src += 2;
+ } else {
+ *p++ = *src;
+ }
+ src++;
+ }
+ *p = '\0';
+ return res;
+}
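
Both supported engines wrap outbound links in redirectors: DuckDuckGo Lite
carries the target in a uddg= parameter terminated by '&', while Yahoo embeds
it in an RU= segment terminated by '/'. A sketch with hypothetical redirector
URLs in those two shapes:

    #include <stdio.h>
    #include <stdlib.h>
    #include "Utility/Unescape.h"

    int main(void) {
      char *ddg = unescape_search_url(
          "//duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2Fpage&rut=ab");
      char *yahoo = unescape_search_url(
          "https://r.search.yahoo.com/_ylt=X/RU=https%3a%2f%2fexample.com%2f/RK=2");
      char *query = url_decode_query("c+web+server%21");

      printf("%s\n", ddg);   /* https://example.com/page */
      printf("%s\n", yahoo); /* https://example.com/ (hex may be lower-case) */
      printf("%s\n", query); /* c web server! */

      free(ddg);
      free(yahoo);
      free(query);
      return 0;
    }
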
diff --git a/src/Utility/Unescape.h b/src/Utility/Unescape.h
new file mode 100644
index 0000000..0adb228
--- /dev/null
+++ b/src/Utility/Unescape.h
@@ -0,0 +1,10 @@
+#ifndef UNESCAPE_H
+#define UNESCAPE_H
+
+#include <stddef.h>
+
+char *unescape_search_url(const char *input);
+char *url_decode_query(const char *src);
+
+#endif
+
diff --git a/src/Utility/Utility.c b/src/Utility/Utility.c
new file mode 100644
index 0000000..8e5af92
--- /dev/null
+++ b/src/Utility/Utility.c
@@ -0,0 +1,8 @@
+#include "Utility.h"
+
+int hex_to_int(char c) {
+ if (c >= '0' && c <= '9') return c - '0';
+ if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+ if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+ return -1;
+}
diff --git a/src/Utility/Utility.h b/src/Utility/Utility.h
new file mode 100644
index 0000000..3b0181c
--- /dev/null
+++ b/src/Utility/Utility.h
@@ -0,0 +1,6 @@
+#ifndef UTILITY_H
+#define UTILITY_H
+
+int hex_to_int(char c);
+
+#endif