Diffstat (limited to 'src')
-rw-r--r--  src/Config.h              |   2
-rw-r--r--  src/Infobox/Calculator.c  | 115
-rw-r--r--  src/Infobox/Calculator.h  |   9
-rw-r--r--  src/Infobox/Infobox.c     |  13
-rw-r--r--  src/Infobox/Infobox.h     |  13
-rw-r--r--  src/Infobox/Wikipedia.c   | 165
-rw-r--r--  src/Infobox/Wikipedia.h   |   9
-rw-r--r--  src/Main.c                |  36
-rw-r--r--  src/Routes/Home.c         |  14
-rw-r--r--  src/Routes/Home.h         |   8
-rw-r--r--  src/Routes/Images.c       | 277
-rw-r--r--  src/Routes/Images.h       |   8
-rw-r--r--  src/Routes/Search.c       | 273
-rw-r--r--  src/Routes/Search.h       |   8
-rw-r--r--  src/Scraping/Scraping.c   | 468
-rw-r--r--  src/Scraping/Scraping.h   |  34
-rw-r--r--  src/Utility/Display.c     |  46
-rw-r--r--  src/Utility/Display.h     |   6
-rw-r--r--  src/Utility/Unescape.c    |  80
-rw-r--r--  src/Utility/Unescape.h    |  10
-rw-r--r--  src/Utility/Utility.c     |   8
-rw-r--r--  src/Utility/Utility.h     |   6

22 files changed, 1608 insertions(+), 0 deletions(-)
diff --git a/src/Config.h b/src/Config.h
new file mode 100644
index 0000000..b5695b7
--- /dev/null
+++ b/src/Config.h
@@ -0,0 +1,2 @@
+static int port = 5000;
+static char host[] = "0.0.0.0";
\ No newline at end of file
diff --git a/src/Infobox/Calculator.c b/src/Infobox/Calculator.c
new file mode 100644
index 0000000..b80ce21
--- /dev/null
+++ b/src/Infobox/Calculator.c
@@ -0,0 +1,115 @@
+#include "Calculator.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+static char logic_log[4096];
+
+typedef struct {
+  const char *buffer;
+  int pos;
+} Parser;
+
+static double parse_expression(Parser *p);
+
+static void skip_ws(Parser *p) {
+  while (p->buffer[p->pos] == ' ') p->pos++;
+}
+
+static double parse_factor(Parser *p) {
+  skip_ws(p);
+  if (p->buffer[p->pos] == '-') {
+    p->pos++;
+    return -parse_factor(p);
+  }
+  if (p->buffer[p->pos] == '(') {
+    p->pos++;
+    double res = parse_expression(p);
+    if (p->buffer[p->pos] == ')') p->pos++;
+    return res;
+  }
+  char *endptr;
+  double val = strtod(&p->buffer[p->pos], &endptr);
+  p->pos = (int)(endptr - p->buffer);
+  return val;
+}
+
+static double parse_term(Parser *p) {
+  double left = parse_factor(p);
+  while (1) {
+    skip_ws(p);
+    char op = p->buffer[p->pos];
+    if (op == '*' || op == '/') {
+      p->pos++;
+      double right = parse_factor(p);
+      double old = left;
+      left = (op == '*') ? left * right : left / right;
+
+      char step[256];
+
+      snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
+               right, left);
+      strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
+    } else
+      break;
+  }
+  return left;
+}
+
+static double parse_expression(Parser *p) {
+  double left = parse_term(p);
+  while (1) {
+    skip_ws(p);
+    char op = p->buffer[p->pos];
+    if (op == '+' || op == '-') {
+      p->pos++;
+      double right = parse_term(p);
+      double old = left;
+      left = (op == '+') ? left + right : left - right;
+
+      char step[256];
+
+      snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
+               right, left);
+      strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
+    } else
+      break;
+  }
+  return left;
+}
+
+double evaluate(const char *expr) {
+  logic_log[0] = '\0';
+  if (!expr || strlen(expr) == 0) return 0.0;
+  Parser p = {expr, 0};
+  return parse_expression(&p);
+}
+
+InfoBox fetch_calc_data(char *math_input) {
+  InfoBox info = {NULL, NULL, NULL, NULL};
+  if (!math_input) return info;
+
+  double result = evaluate(math_input);
+
+  char html_output[5120];
+  snprintf(html_output, sizeof(html_output),
+           "<div class='calc-container' style='line-height: 1.6;'>"
+           "%s"
+           "<div style='margin-top: 8px; border-top: 1px solid #eee; "
+           "padding-top: 8px; font-size: 1.2em;'>"
+           "<b>%g</b>"
+           "</div>"
+           "</div>",
+           strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>",
+           result);
+
+  info.title = strdup("Calculation");
+  info.extract = strdup(html_output);
+  info.thumbnail_url =
+      strdup("/static/calculation.svg");
+  info.url = strdup("#");
+
+  return info;
+}
diff --git a/src/Infobox/Calculator.h b/src/Infobox/Calculator.h
new file mode 100644
index 0000000..275aed6
--- /dev/null
+++ b/src/Infobox/Calculator.h
@@ -0,0 +1,9 @@
+#ifndef CALCULATOR_H
+#define CALCULATOR_H
+
+#include "Infobox.h"
+
+double evaluate(const char *expr);
+InfoBox fetch_calc_data(char *math_input);
+
+#endif
diff --git a/src/Infobox/Infobox.c b/src/Infobox/Infobox.c
new file mode 100644
index 0000000..5043c05
--- /dev/null
+++ b/src/Infobox/Infobox.c
@@ -0,0 +1,13 @@
+#include "Infobox.h"
+#include <stdlib.h>
+
+void free_infobox(InfoBox *info) {
+  if (info->title)
+    free(info->title);
+  if (info->thumbnail_url)
+    free(info->thumbnail_url);
+  if (info->extract)
+    free(info->extract);
+  if (info->url)
+    free(info->url);
+}
diff --git a/src/Infobox/Infobox.h b/src/Infobox/Infobox.h
new file mode 100644
index 0000000..a052b80
--- /dev/null
+++ b/src/Infobox/Infobox.h
@@ -0,0 +1,13 @@
+#ifndef INFOBOX_H
+#define INFOBOX_H
+
+typedef struct {
+  char *title;
+  char *thumbnail_url;
+  char *extract;
+  char *url;
+} InfoBox;
+
+void free_infobox(InfoBox *info);
+
+#endif
diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c
new file mode 100644
index 0000000..ed4645f
--- /dev/null
+++ b/src/Infobox/Wikipedia.c
@@ -0,0 +1,165 @@
+#include "Wikipedia.h"
+#include <curl/curl.h>
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+struct WikiMemoryStruct {
+  char *memory;
+  size_t size;
+};
+
+static void shorten_summary(char **extract_ptr, int max_chars) {
+  if (!extract_ptr || !*extract_ptr) return;
+
+  char *text = *extract_ptr;
+  int len = strlen(text);
+
+  if (len <= max_chars) return;
+
+  int end_pos = max_chars;
+  for (int i = max_chars; i > (max_chars / 2); i--) {
+    if (text[i] == '.' || text[i] == '!' || text[i] == '?') {
+      end_pos = i + 1;
+      break;
+    }
+  }
+
+  char *new_text = (char *)malloc(end_pos + 4);
+
+  if (new_text) {
+    strncpy(new_text, text, end_pos);
+    new_text[end_pos] = '\0';
+    strcat(new_text, "...");
+    free(*extract_ptr);
+    *extract_ptr = new_text;
+  }
+}
+
+static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb,
+                                      void *userp) {
+  size_t realsize = size * nmemb;
+  struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp;
+
+  char *ptr = realloc(mem->memory, mem->size + realsize + 1);
+  if (ptr == NULL) {
+    fprintf(stderr, "Not enough memory (realloc returned NULL)\n");
+    return 0;
+  }
+
+  mem->memory = ptr;
+  memcpy(&(mem->memory[mem->size]), contents, realsize);
+  mem->size += realsize;
+  mem->memory[mem->size] = 0;
+
+  return realsize;
+}
+
+static void extract_wiki_info(xmlNode *node, InfoBox *info) {
+  xmlNode *cur_node = NULL;
+
+  for (cur_node = node; cur_node; cur_node = cur_node->next) {
+    if (cur_node->type == XML_ELEMENT_NODE) {
+      if (strcmp((const char *)cur_node->name, "page") == 0) {
+        xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title");
+        if (title) {
+          info->title = strdup((const char *)title);
+
+          const char *base_article_url = "https://en.wikipedia.org/wiki/";
+          char *formatted_title = strdup((const char *)title);
+          for (int i = 0; formatted_title[i]; i++) {
+            if (formatted_title[i] == ' ') formatted_title[i] = '_';
+          }
+
+          info->url =
+              malloc(strlen(base_article_url) + strlen(formatted_title) + 1);
+          if (info->url) {
+            strcpy(info->url, base_article_url);
+            strcat(info->url, formatted_title);
+          }
+          free(formatted_title);
+          xmlFree(title);
+        }
+      }
+
+      if (strcmp((const char *)cur_node->name, "thumbnail") == 0) {
+        xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source");
+        if (source) {
+          info->thumbnail_url = strdup((const char *)source);
+          xmlFree(source);
+        }
+      }
+
+      if (strcmp((const char *)cur_node->name, "extract") == 0) {
+        xmlChar *content = xmlNodeGetContent(cur_node);
+        if (content) {
+          info->extract = strdup((const char *)content);
+
+          shorten_summary(&(info->extract), 300);
+          xmlFree(content);
+        }
+      }
+    }
+    extract_wiki_info(cur_node->children, info);
+  }
+}
+
+InfoBox fetch_wiki_data(char *api_url) {
+  CURL *curl_handle;
+  CURLcode res;
+  struct WikiMemoryStruct chunk;
+  InfoBox info = {NULL, NULL, NULL, NULL};
+
+  chunk.memory = malloc(1);
+  chunk.size = 0;
+
+  curl_handle = curl_easy_init();
+
+  if (curl_handle) {
+    curl_easy_setopt(curl_handle, CURLOPT_URL, api_url);
+    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
+                     WikiWriteMemoryCallback);
+    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
+    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
+
+    res = curl_easy_perform(curl_handle);
+
+    if (res == CURLE_OK) {
+      xmlDocPtr doc =
+          xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
+      if (doc != NULL) {
+        xmlNode *root_element = xmlDocGetRootElement(doc);
+        extract_wiki_info(root_element, &info);
+        xmlFreeDoc(doc);
+      }
+    }
+
+    curl_easy_cleanup(curl_handle);
+    free(chunk.memory);
+  }
+
+  return info;
+}
+
+char *construct_wiki_url(const char *search_term) {
+  CURL *curl = curl_easy_init();
+  if (!curl) return NULL;
+
+  char *escaped_term = curl_easy_escape(curl, search_term, 0);
+  const char *base =
+      "https://en.wikipedia.org/w/"
+      "api.php?action=query&prop=extracts|pageimages&exintro&"
+      "explaintext&pithumbsize=400&format=xml&origin=*&titles=";
+
+  char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1);
+  if (full_url) {
+    strcpy(full_url, base);
+    strcat(full_url, escaped_term);
+  }
+
+  curl_free(escaped_term);
+  curl_easy_cleanup(curl);
+  return full_url;
+}
diff --git a/src/Infobox/Wikipedia.h b/src/Infobox/Wikipedia.h
new file mode 100644
index 0000000..8a8103e
--- /dev/null
+++ b/src/Infobox/Wikipedia.h
@@ -0,0 +1,9 @@
+#ifndef WIKIPEDIA_H
+#define WIKIPEDIA_H
+
+#include "Infobox.h"
+
+InfoBox fetch_wiki_data(char *api_url);
+char *construct_wiki_url(const char *search_term);
+
+#endif
diff --git a/src/Main.c b/src/Main.c
new file mode 100644
index 0000000..ad08f3e
--- /dev/null
+++ b/src/Main.c
@@ -0,0 +1,36 @@
+#include <beaker.h>
+#include <curl/curl.h>
+#include <libxml/parser.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "Config.h"
+#include "Routes/Home.h"
+#include "Routes/Images.h"
+#include "Routes/Search.h"
+
+int main() {
+  LIBXML_TEST_VERSION
+  xmlInitParser();
+
+  curl_global_init(CURL_GLOBAL_DEFAULT);
+
+  set_handler("/", home_handler);
+  set_handler("/search", results_handler);
+  set_handler("/images", images_handler);
+
+  fprintf(stderr, "Starting Omnisearch on %s:%d\n", host, port);
+
+  int result = beaker_run(host, port);
+
+  if (result != 0) {
+    fprintf(stderr, "Error: Beaker server failed to start.\n");
+    curl_global_cleanup();
+    xmlCleanupParser();
+    return EXIT_FAILURE;
+  }
+
+  curl_global_cleanup();
+  xmlCleanupParser();
+  return EXIT_SUCCESS;
+}
diff --git a/src/Routes/Home.c b/src/Routes/Home.c
new file mode 100644
index 0000000..81370ba
--- /dev/null
+++ b/src/Routes/Home.c
@@ -0,0 +1,14 @@
+#include "Home.h"
+#include <stdlib.h>
+
+int home_handler(UrlParams *params) {
+  (void)params;
+  TemplateContext ctx = new_context();
+  char *rendered_html = render_template("home.html", &ctx);
+  send_response(rendered_html);
+
+  free(rendered_html);
+  free_context(&ctx);
+
+  return 0;
+}
diff --git a/src/Routes/Home.h b/src/Routes/Home.h
new file mode 100644
index 0000000..5d01ab3
--- /dev/null
+++ b/src/Routes/Home.h
@@ -0,0 +1,8 @@
+#ifndef HOME_H
+#define HOME_H
+
+#include <beaker.h>
+
+int home_handler(UrlParams *params);
+
+#endif
diff --git a/src/Routes/Images.c b/src/Routes/Images.c
new file mode 100644
index 0000000..47e3a72
--- /dev/null
+++ b/src/Routes/Images.c
@@ -0,0 +1,277 @@
+#include "Images.h"
+#include "../Utility/Unescape.h"
+
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+static void get_image_timestamp(char *buffer, size_t size) {
+  time_t now = time(NULL);
+  struct tm *t = localtime(&now);
+  if (t) {
+    strftime(buffer, size, "%Y-%m-%d %H:%M:%S", t);
+  }
+}
+
+#define IMG_LOG_INFO(msg, ...)                                  \
+  {                                                             \
+    char ts[20];                                                \
+    get_image_timestamp(ts, sizeof(ts));                        \
+    fprintf(stderr, "[%s] INFO [ImagesHandler] " msg "\n", ts,  \
+            ##__VA_ARGS__);                                     \
+  }
+
+#define IMG_LOG_ERROR(msg, ...)                                 \
+  {                                                             \
+    char ts[20];                                                \
+    get_image_timestamp(ts, sizeof(ts));                        \
+    fprintf(stderr, "[%s] ERROR [ImagesHandler] " msg "\n", ts, \
+            ##__VA_ARGS__);                                     \
+  }
+
+struct MemoryBlock {
+  char *response;
+  size_t size;
+};
+
+static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb,
+                                 void *userp) {
+  size_t realsize = size * nmemb;
+  struct MemoryBlock *mem = (struct MemoryBlock *)userp;
+  char *ptr = (char *)realloc(mem->response, mem->size + realsize + 1);
+  if (ptr == NULL) {
+    IMG_LOG_ERROR("Realloc failed in WriteCallback (out of memory)");
+    return 0;
+  }
+  mem->response = ptr;
+  memcpy(&(mem->response[mem->size]), data, realsize);
+  mem->size += realsize;
+  mem->response[mem->size] = 0;
+  return realsize;
+}
+
+static char *fetch_images_html(const char *url) {
+  CURL *curl_handle;
+  struct MemoryBlock chunk = {.response = malloc(1), .size = 0};
+  if (!chunk.response) {
+    IMG_LOG_ERROR("Initial malloc failed for fetch_images_html");
+    return NULL;
+  }
+
+  IMG_LOG_INFO("Initializing cURL handle for URL: %s", url);
+  curl_handle = curl_easy_init();
+  if (!curl_handle) {
+    IMG_LOG_ERROR("curl_easy_init() failed");
+    free(chunk.response);
+    return NULL;
+  }
+
+  curl_easy_setopt(curl_handle, CURLOPT_URL, url);
+  curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, ImageWriteCallback);
+  curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
+  curl_easy_setopt(
+      curl_handle, CURLOPT_USERAGENT,
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36");
+  curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L);
+
+  CURLcode res = curl_easy_perform(curl_handle);
+  if (res != CURLE_OK) {
+    IMG_LOG_ERROR("curl_easy_perform() failed: %s", curl_easy_strerror(res));
+    free(chunk.response);
+    curl_easy_cleanup(curl_handle);
+    return NULL;
+  }
+
+  IMG_LOG_INFO("Successfully fetched %zu bytes from Yahoo Images", chunk.size);
+  curl_easy_cleanup(curl_handle);
+  return chunk.response;
+}
+
+static char *get_json_field_internal(const char *json, const char *key) {
+  if (!json) return NULL;
+  char search_key[64];
+  snprintf(search_key, sizeof(search_key), "\"%s\":\"", key);
+  char *start = strstr(json, search_key);
+  if (!start) return NULL;
+  start += strlen(search_key);
+  char *end = strchr(start, '\"');
+  if (!end) return NULL;
+
+  size_t len = end - start;
+  char *val = (char *)malloc(len + 1);
+  if (!val) return NULL;
+
+  size_t j = 0;
+  for (size_t i = 0; i < len; i++) {
+    if (start[i] == '\\' && i + 1 < len && start[i + 1] == '/') {
+      val[j++] = '/';
+      i++;
+    } else {
+      val[j++] = start[i];
+    }
+  }
+  val[j] = '\0';
+  return val;
+}
+
+int images_handler(UrlParams *params) {
+  IMG_LOG_INFO("Start images_handler request processing");
+  TemplateContext ctx = new_context();
+  char *raw_query = "";
+
+  if (params) {
+    for (int i = 0; i < params->count; i++) {
+      if (strcmp(params->params[i].key, "q") == 0) {
+        raw_query = params->params[i].value;
+        break;
+      }
+    }
+  }
+
+  char *encoded_query = strdup(raw_query);
+
+  char *display_query = url_decode_query(raw_query);
+  context_set(&ctx, "query", display_query);
+
+  if (!encoded_query || strlen(encoded_query) == 0) {
+    IMG_LOG_INFO("Empty search query received, returning early warning");
+    send_response("<h1>No query provided</h1>");
+    if (encoded_query) free(encoded_query);
+    if (display_query) free(display_query);
+    free_context(&ctx);
+    return -1;
+  }
+
+  char url[1024];
+  snprintf(url, sizeof(url),
+           "https://images.search.yahoo.com/search/images?p=%s",
+           encoded_query);
+
+  IMG_LOG_INFO("Requesting external HTML from Yahoo Images...");
+  char *html = fetch_images_html(url);
+  if (!html) {
+    IMG_LOG_ERROR("Failed to fetch image search results from Yahoo");
+    send_response("<h1>Error fetching images</h1>");
+    free(encoded_query);
+    free(display_query);
+    free_context(&ctx);
+    return -1;
+  }
+
+  IMG_LOG_INFO("Parsing HTML with libxml2...");
+  htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL,
+                                  HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
+  if (!doc) {
+    IMG_LOG_ERROR("htmlReadMemory failed to create document pointer");
+    free(html);
+    free(encoded_query);
+    free(display_query);
+    free_context(&ctx);
+    return -1;
+  }
+
+  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+
+  if (!xpathCtx) {
+    IMG_LOG_ERROR("xmlXPathNewContext failed");
+    xmlFreeDoc(doc);
+    free(html);
+    free(encoded_query);
+    free(display_query);
+    free_context(&ctx);
+    return -1;
+  }
+
+  IMG_LOG_INFO("Executing XPath expression: //li[@data]");
+  xmlXPathObjectPtr xpathObj =
+      xmlXPathEvalExpression((const xmlChar *)"//li[@data]", xpathCtx);
+
+  int image_count = 0;
+  char ***image_matrix = NULL;
+  int *inner_counts = NULL;
+
+  if (xpathObj && xpathObj->nodesetval) {
+    int nodes = xpathObj->nodesetval->nodeNr;
+    IMG_LOG_INFO("XPath found %d potential image nodes", nodes);
+
+    int max_images = (nodes < 32) ? nodes : 32;
+    image_matrix = malloc(sizeof(char **) * max_images);
+    inner_counts = malloc(sizeof(int) * max_images);
+
+    for (int i = 0; i < nodes; i++) {
+      if (image_count >= 32) break;
+
+      xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
+      xmlChar *data_attr = xmlGetProp(node, (const xmlChar *)"data");
+      if (data_attr) {
+        char *iurl = get_json_field_internal((char *)data_attr, "iurl");
+        char *title = get_json_field_internal((char *)data_attr, "alt");
+        char *rurl = get_json_field_internal((char *)data_attr, "rurl");
+
+        if (iurl && strlen(iurl) > 0) {
+          image_matrix[image_count] = malloc(sizeof(char *) * 3);
+          image_matrix[image_count][0] = strdup(iurl);
+          image_matrix[image_count][1] = strdup(title ? title : "Image");
+          image_matrix[image_count][2] = strdup(rurl ? rurl : "#");
+          inner_counts[image_count] = 3;
+          image_count++;
+        }
+
+        if (iurl) free(iurl);
+        if (title) free(title);
+        if (rurl) free(rurl);
+        xmlFree(data_attr);
+      }
+    }
+    IMG_LOG_INFO("Successfully parsed %d valid image results (capped at 32)",
+                 image_count);
+  } else {
+    IMG_LOG_INFO("No image nodes found in the HTML document");
+  }
+
+  IMG_LOG_INFO("Setting image array in template context...");
+  context_set_array_of_arrays(&ctx, "images", image_matrix, image_count,
+                              inner_counts);
+
+  IMG_LOG_INFO("Rendering images.html template...");
+  char *rendered = render_template("images.html", &ctx);
+  if (rendered) {
+    IMG_LOG_INFO("Sending rendered template to client (%zu bytes)",
+                 strlen(rendered));
+    send_response(rendered);
+    free(rendered);
+  } else {
+    IMG_LOG_ERROR("render_template returned NULL for images.html");
+    send_response("<h1>Error rendering image results</h1>");
+  }
+
+  IMG_LOG_INFO("Beginning memory cleanup...");
+
+  if (image_matrix) {
+    for (int i = 0; i < image_count; i++) {
+      for (int j = 0; j < 3; j++) {
+        free(image_matrix[i][j]);
+      }
+      free(image_matrix[i]);
+    }
+    free(image_matrix);
+  }
+  if (inner_counts) {
+    free(inner_counts);
+  }
+
+  if (xpathObj) xmlXPathFreeObject(xpathObj);
+  if (xpathCtx) xmlXPathFreeContext(xpathCtx);
+  if (doc) xmlFreeDoc(doc);
+  free(html);
+  free(encoded_query);
+  free(display_query);
+  free_context(&ctx);
+
+  IMG_LOG_INFO("Images request cycle complete");
+  return 0;
+}
diff --git a/src/Routes/Images.h b/src/Routes/Images.h
new file mode 100644
index 0000000..86f4a31
--- /dev/null
+++ b/src/Routes/Images.h
@@ -0,0 +1,8 @@
+#ifndef IMAGES_HANDLER_H
+#define IMAGES_HANDLER_H
+
+#include <beaker.h>
+
+int images_handler(UrlParams *params);
+
+#endif
diff --git a/src/Routes/Search.c b/src/Routes/Search.c
new file mode 100644
index 0000000..110e6f7
--- /dev/null
+++ b/src/Routes/Search.c
@@ -0,0 +1,273 @@
+#include "Search.h"
+#include "../Infobox/Wikipedia.h"
+#include "../Infobox/Calculator.h"
+#include "../Scraping/Scraping.h"
+#include "../Utility/Display.h"
+#include "../Utility/Unescape.h"
+#include <ctype.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+typedef struct {
+  const SearchEngine *engine;
+  const char *query;
+  SearchResult *results;
+  int count;
+} EngineThreadData;
+
+static void *scrape_thread_func(void *arg) {
+  EngineThreadData *data = (EngineThreadData *)arg;
+  data->count = scrape_engine(data->engine, data->query, &data->results, 10);
+  return NULL;
+}
+
+typedef struct {
+  const char *query;
+  InfoBox result;
+  int success;
+} InfoBoxThreadData;
+
+static void *wiki_thread_func(void *arg) {
+  InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
+  char *dynamic_url = construct_wiki_url(data->query);
+  if (dynamic_url) {
+    data->result = fetch_wiki_data(dynamic_url);
+    data->success =
+        (data->result.title != NULL && data->result.extract != NULL &&
+         strlen(data->result.extract) > 10);
+    free(dynamic_url);
+  } else {
+    data->success = 0;
+  }
+  return NULL;
+}
+
+static int is_calculator_query(const char *query) {
+  if (!query) return 0;
+
+  int has_digit = 0;
+  int has_operator = 0;
+
+  for (const char *p = query; *p; p++) {
+    if (isdigit(*p) || *p == '.') {
+      has_digit = 1;
+    }
+    if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '=' ||
+        *p == '^') {
+      has_operator = 1;
+    }
+  }
+
+  return has_digit && (has_operator || strchr(query, '.'));
+}
+
+static void *calc_thread_func(void *arg) {
+  InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
+
+  if (is_calculator_query(data->query)) {
+    data->result = fetch_calc_data((char *)data->query);
+    data->success =
+        (data->result.title != NULL && data->result.extract != NULL);
+  } else {
+    data->success = 0;
+  }
+
+  return NULL;
+}
+
+static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
+                                     int **inner_counts, int current_count) {
+  *collection =
+      (char ***)realloc(*collection, sizeof(char **) * (current_count + 1));
+  *inner_counts =
+      (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1));
+
+  (*collection)[current_count] = (char **)malloc(sizeof(char *) * 4);
+  (*collection)[current_count][0] = infobox->title;
+  (*collection)[current_count][1] = infobox->thumbnail_url;
+  (*collection)[current_count][2] = infobox->extract;
+  (*collection)[current_count][3] = infobox->url;
+  (*inner_counts)[current_count] = 4;
+
+  return current_count + 1;
+}
+
+int results_handler(UrlParams *params) {
+  TemplateContext ctx = new_context();
+  char *raw_query = "";
+
+  if (params) {
+    for (int i = 0; i < params->count; i++) {
+      if (strcmp(params->params[i].key, "q") == 0) {
+        raw_query = params->params[i].value;
+        break;
+      }
+    }
+  }
+
+  char *encoded_query = strdup(raw_query);
+
+  char *display_query = url_decode_query(raw_query);
+  LOG_INFO("Processing search request for query: '%s'", display_query);
+  context_set(&ctx, "query", display_query);
+
+  if (!encoded_query || strlen(encoded_query) == 0) {
+    LOG_ERROR("Empty search query provided.");
+    send_response("<h1>No query provided</h1>");
+    if (encoded_query) free(encoded_query);
+    if (display_query) free(display_query);
+    free_context(&ctx);
+    return -1;
+  }
+
+  pthread_t wiki_tid, calc_tid;
+  InfoBoxThreadData wiki_data = {.query = display_query, .success = 0};
+  InfoBoxThreadData calc_data = {.query = display_query, .success = 0};
+
+  pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
+  pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
+
+  pthread_t engine_tids[ENGINE_COUNT];
+  EngineThreadData engine_data[ENGINE_COUNT];
+
+  for (int i = 0; i < ENGINE_COUNT; i++) {
+    engine_data[i].engine = &ENGINE_REGISTRY[i];
+    engine_data[i].query = encoded_query;
+
+    engine_data[i].results = NULL;
+    engine_data[i].count = 0;
+    pthread_create(&engine_tids[i], NULL, scrape_thread_func, &engine_data[i]);
+  }
+
+  pthread_join(wiki_tid, NULL);
+  pthread_join(calc_tid, NULL);
+
+  char ***infobox_matrix = NULL;
+  int *infobox_inner_counts = NULL;
+  int infobox_count = 0;
+
+  if (calc_data.success) {
+    infobox_count =
+        add_infobox_to_collection(&calc_data.result, &infobox_matrix,
+                                  &infobox_inner_counts, infobox_count);
+  }
+
+  if (wiki_data.success) {
+    infobox_count =
+        add_infobox_to_collection(&wiki_data.result, &infobox_matrix,
+                                  &infobox_inner_counts, infobox_count);
+  }
+
+  if (infobox_count > 0) {
+    context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix,
+                                infobox_count, infobox_inner_counts);
+    free(infobox_matrix);
+    free(infobox_inner_counts);
+  } else {
+    context_set_array_of_arrays(&ctx, "infoboxes", NULL, 0, NULL);
+  }
+
+  int total_results = 0;
+  for (int i = 0; i < ENGINE_COUNT; i++) {
+    pthread_join(engine_tids[i], NULL);
+    total_results += engine_data[i].count;
+  }
+
+  if (total_results > 0) {
+    char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results);
+    int *results_inner_counts = (int *)malloc(sizeof(int) * total_results);
+    char **seen_urls = (char **)malloc(sizeof(char *) * total_results);
+    int unique_count = 0;
+
+    for (int i = 0; i < ENGINE_COUNT; i++) {
+      for (int j = 0; j < engine_data[i].count; j++) {
+        char *raw_url = engine_data[i].results[j].url;
+        char *clean_url = unescape_search_url(raw_url);
+        char *display_url = clean_url ? clean_url : raw_url;
+
+        int is_duplicate = 0;
+        for (int k = 0; k < unique_count; k++) {
+          if (strcmp(seen_urls[k], display_url) == 0) {
+            is_duplicate = 1;
+            break;
+          }
+        }
+
+        if (is_duplicate) {
+          if (clean_url) free(clean_url);
+          free(engine_data[i].results[j].url);
+          free(engine_data[i].results[j].title);
+          free(engine_data[i].results[j].snippet);
+          continue;
+        }
+
+        seen_urls[unique_count] = strdup(display_url);
+        results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4);
+        char *pretty_url = pretty_display_url(display_url);
+
+        results_matrix[unique_count][0] = strdup(display_url);
+        results_matrix[unique_count][1] = strdup(pretty_url);
+        results_matrix[unique_count][2] =
+            engine_data[i].results[j].title
+                ? strdup(engine_data[i].results[j].title)
+                : strdup("Untitled");
+        results_matrix[unique_count][3] =
+            engine_data[i].results[j].snippet
+                ? strdup(engine_data[i].results[j].snippet)
+                : strdup("");
+
+        results_inner_counts[unique_count] = 4;
+
+        free(pretty_url);
+        free(engine_data[i].results[j].url);
+        free(engine_data[i].results[j].title);
+        free(engine_data[i].results[j].snippet);
+        if (clean_url) free(clean_url);
+
+        unique_count++;
+      }
+      free(engine_data[i].results);
+    }
+
+    context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count,
+                                results_inner_counts);
+
+    char *html = render_template("results.html", &ctx);
+    if (html) {
+      send_response(html);
+      free(html);
+    }
+
+    for (int i = 0; i < unique_count; i++) {
+      for (int j = 0; j < 4; j++) free(results_matrix[i][j]);
+      free(results_matrix[i]);
+      free(seen_urls[i]);
+    }
+    free(seen_urls);
+    free(results_matrix);
+    free(results_inner_counts);
+  } else {
+    char *html = render_template("results.html", &ctx);
+    if (html) {
+      send_response(html);
+      free(html);
+    }
+  }
+
+  if (wiki_data.success) {
+    free_infobox(&wiki_data.result);
+  }
+
+  if (calc_data.success) {
+    free_infobox(&calc_data.result);
+  }
+
+  free(encoded_query);
+  free(display_query);
+  free_context(&ctx);
+
+  return 0;
+}
diff --git a/src/Routes/Search.h b/src/Routes/Search.h
new file mode 100644
index 0000000..c6bc146
--- /dev/null
+++ b/src/Routes/Search.h
@@ -0,0 +1,8 @@
+#ifndef SEARCH_HANDLER_H
+#define SEARCH_HANDLER_H
+
+#include <beaker.h>
+
+int results_handler(UrlParams *params);
+
+#endif
diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c
new file mode 100644
index 0000000..d2afea6
--- /dev/null
+++ b/src/Scraping/Scraping.c
@@ -0,0 +1,468 @@
+#include "Scraping.h"
+#include <curl/curl.h>
+#include <libxml/HTMLparser.h>
+#include <libxml/xpath.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+typedef struct {
+  char *memory;
+  size_t size;
+} MemoryBuffer;
+
+static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
+                                  void *userp) {
+  size_t realsize = size * nmemb;
+  MemoryBuffer *mem = (MemoryBuffer *)userp;
+
+  char *ptr = (char *)realloc(mem->memory, mem->size + realsize + 1);
+  if (ptr == NULL) {
+    LOG_ERROR("Not enough memory (realloc returned NULL)");
+    return 0;
+  }
+
+  mem->memory = ptr;
+  memcpy(&(mem->memory[mem->size]), contents, realsize);
+  mem->size += realsize;
+  mem->memory[mem->size] = 0;
+
+  return realsize;
+}
+
+static const char *get_random_user_agent() {
+  static const char *agents[] = {
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
+      "like Gecko) Chrome/120.0.0.0 Safari/537.36",
+      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
+      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
+      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
+      "Gecko) "
+      "Chrome/120.0.0.0 Safari/537.36",
+      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
+      "Firefox/121.0",
+      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
+      "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
+  return agents[rand() % 5];
+}
+
+static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
+                          SearchResult **out_results, int max_results) {
+  LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+  int found_count = 0;
+  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+  if (!xpathCtx) {
+    LOG_ERROR("[%s] Could not create XPath context", engine_name);
+    return 0;
+  }
+
+  const char *link_xpath = "//a[@class='result-link']";
+  xmlXPathObjectPtr xpathObj =
+      xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx);
+
+  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+    LOG_WARN("[%s] No results found with XPath: %s", engine_name, link_xpath);
+    if (xpathObj) xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  int num_links = xpathObj->nodesetval->nodeNr;
+  LOG_INFO("[%s] XPath matched %d potential result links", engine_name,
+           num_links);
+
+  int actual_alloc = (num_links < max_results) ? num_links : max_results;
+  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+  if (!*out_results) {
+    LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+    xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  for (int i = 0; i < num_links && found_count < max_results; i++) {
+    xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
+    char *title = (char *)xmlNodeGetContent(linkNode);
+    char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
+    char *snippet_text = NULL;
+
+    xmlNodePtr current = linkNode->parent;
+    while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
+      current = current->parent;
+
+    if (current && current->next) {
+      xmlNodePtr snippetRow = current->next;
+      while (snippetRow &&
+             xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
+        snippetRow = snippetRow->next;
+      if (snippetRow) {
+        xmlXPathContextPtr subCtx = xmlXPathNewContext(doc);
+        if (subCtx) {
+          subCtx->node = snippetRow;
+          xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
+              (xmlChar *)".//td[@class='result-snippet']", subCtx);
+          if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
+            snippet_text =
+                (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
+          }
+          if (sObj) xmlXPathFreeObject(sObj);
+          xmlXPathFreeContext(subCtx);
+        }
+      }
+    }
+
+    (*out_results)[found_count].url = strdup(url ? url : "");
+    (*out_results)[found_count].title = strdup(title ? title : "No Title");
+    (*out_results)[found_count].snippet =
+        strdup(snippet_text ? snippet_text : "");
+
+    LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+              (*out_results)[found_count].title);
+    found_count++;
+
+    if (title) xmlFree(title);
+    if (url) xmlFree(url);
+    if (snippet_text) xmlFree(snippet_text);
+  }
+
+  xmlXPathFreeObject(xpathObj);
+  xmlXPathFreeContext(xpathCtx);
+  return found_count;
+}
+
+static int parse_startpage(const char *engine_name, xmlDocPtr doc,
+                           SearchResult **out_results, int max_results) {
+  LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+  int found_count = 0;
+  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+  if (!xpathCtx) {
+    LOG_ERROR("[%s] Could not create XPath context", engine_name);
+    return 0;
+  }
+
+  const char *container_xpath = "//div[contains(@class, 'result')]";
+  xmlXPathObjectPtr xpathObj =
+      xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+    LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+             container_xpath);
+    if (xpathObj) xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  int num_results = xpathObj->nodesetval->nodeNr;
+  LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+  int actual_alloc = (num_results < max_results) ? num_results : max_results;
+  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+  if (!*out_results) {
+    LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+    xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  for (int i = 0; i < num_results && found_count < max_results; i++) {
+    xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+    if (!resCtx) {
+      LOG_ERROR("[%s] Failed to create result context for item %d",
+                engine_name, i);
+      continue;
+    }
+    resCtx->node = resultNode;
+
+    xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+        (xmlChar *)".//a[contains(@class, 'result-link')]", resCtx);
+    char *url =
+        (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+                                 (xmlChar *)"href")
+            : NULL;
+
+    xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", resCtx);
+    char *title =
+        (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+            : NULL;
+
+    xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+        (xmlChar *)".//p[contains(@class, 'description')]", resCtx);
+    char *snippet_text =
+        (snippetObj && snippetObj->nodesetval &&
+         snippetObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+            : NULL;
+
+    if (url && title) {
+      (*out_results)[found_count].url = strdup(url);
+      (*out_results)[found_count].title = strdup(title);
+      (*out_results)[found_count].snippet =
+          strdup(snippet_text ? snippet_text : "");
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+                title);
+      found_count++;
+    } else {
+      LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+               engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+    }
+
+    if (title) xmlFree(title);
+    if (url) xmlFree(url);
+    if (snippet_text) xmlFree(snippet_text);
+    if (linkObj) xmlXPathFreeObject(linkObj);
+    if (titleObj) xmlXPathFreeObject(titleObj);
+    if (snippetObj) xmlXPathFreeObject(snippetObj);
+    xmlXPathFreeContext(resCtx);
+  }
+
+  xmlXPathFreeObject(xpathObj);
+  xmlXPathFreeContext(xpathCtx);
+  return found_count;
+}
+
+static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
+                       SearchResult **out_results, int max_results) {
+  LOG_DEBUG("[%s] Starting XPath parsing...", engine_name);
+  int found_count = 0;
+  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
+  if (!xpathCtx) {
+    LOG_ERROR("[%s] Could not create XPath context", engine_name);
+    return 0;
+  }
+
+  const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
+  xmlXPathObjectPtr xpathObj =
+      xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);
+
+  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
+    LOG_WARN("[%s] No result containers found with XPath: %s", engine_name,
+             container_xpath);
+    if (xpathObj) xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  int num_results = xpathObj->nodesetval->nodeNr;
+  LOG_INFO("[%s] Found %d result containers", engine_name, num_results);
+
+  int actual_alloc = (num_results < max_results) ? num_results : max_results;
+  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
+  if (!*out_results) {
+    LOG_ERROR("[%s] Failed to allocate memory for results", engine_name);
+    xmlXPathFreeObject(xpathObj);
+    xmlXPathFreeContext(xpathCtx);
+    return 0;
+  }
+
+  for (int i = 0; i < num_results && found_count < max_results; i++) {
+    xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
+    xmlXPathContextPtr resCtx = xmlXPathNewContext(doc);
+    if (!resCtx) {
+      LOG_ERROR("[%s] Failed to create result context for item %d",
+                engine_name, i);
+      continue;
+    }
+    resCtx->node = resultNode;
+
+    xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
+        (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
+        resCtx);
+    char *url =
+        (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
+                                 (xmlChar *)"href")
+            : NULL;
+
+    xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
+        (xmlChar *)".//h3[contains(@class, 'title')]", resCtx);
+    char *title =
+        (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
+            : NULL;
+
+    xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
+        (xmlChar *)".//div[contains(@class, 'compText')]//p", resCtx);
+    char *snippet_text =
+        (snippetObj && snippetObj->nodesetval &&
+         snippetObj->nodesetval->nodeNr > 0)
+            ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
+            : NULL;
+
+    if (!url || !title) {
+      LOG_DEBUG("[%s] Container %d debug - URL: %s, Title: %s", engine_name, i,
+                url ? url : "(null)", title ? title : "(null)");
+    }
+
+    if (url && title) {
+      (*out_results)[found_count].url = strdup(url);
+      (*out_results)[found_count].title = strdup(title);
+      (*out_results)[found_count].snippet =
+          strdup(snippet_text ? snippet_text : "");
+      LOG_DEBUG("[%s] Parsed Result %d: %s", engine_name, found_count + 1,
+                title);
+      found_count++;
+    } else {
+      LOG_WARN("[%s] Container %d missed URL or Title. URL: %s, Title: %s",
+               engine_name, i, url ? "Yes" : "No", title ? "Yes" : "No");
+    }
+
+    if (title) xmlFree(title);
+    if (url) xmlFree(url);
+    if (snippet_text) xmlFree(snippet_text);
+    if (linkObj) xmlXPathFreeObject(linkObj);
+    if (titleObj) xmlXPathFreeObject(titleObj);
+    if (snippetObj) xmlXPathFreeObject(snippetObj);
+    xmlXPathFreeContext(resCtx);
+  }
+
+  xmlXPathFreeObject(xpathObj);
+  xmlXPathFreeContext(xpathCtx);
+  return found_count;
+}
+
+const SearchEngine ENGINE_REGISTRY[] = {
+    {.name = "DuckDuckGo Lite",
+     .base_url = "https://lite.duckduckgo.com/lite/?q=",
+     .host_header = "lite.duckduckgo.com",
+     .referer = "https://lite.duckduckgo.com/",
+     .parser = parse_ddg_lite},
+    {.name = "Startpage",
+     .base_url = "https://www.startpage.com/sp/search?q=",
+     .host_header = "www.startpage.com",
+     .referer = "https://www.startpage.com/",
+     .parser = parse_startpage},
+    {.name = "Yahoo",
+     .base_url = "https://search.yahoo.com/search?p=",
+     .host_header = "search.yahoo.com",
+     .referer = "https://search.yahoo.com/",
+     .parser = parse_yahoo}};
+
+const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+                  SearchResult **out_results, int max_results) {
+  CURL *curl;
+  MemoryBuffer chunk = {.memory = (char *)malloc(1), .size = 0};
+  int results_count = 0;
+
+  LOG_INFO("--- Starting scrape for engine: %s ---", engine->name);
+  LOG_INFO("[%s] Query: '%s'", engine->name, query);
+
+  if (!chunk.memory) {
+    LOG_ERROR("Initial memory allocation failed");
+    return -1;
+  }
+
+  curl = curl_easy_init();
+
+  if (curl && query) {
+    char full_url[1024];
+    char *encoded_query = curl_easy_escape(curl, query, 0);
+    if (!encoded_query) {
+      LOG_ERROR("[%s] Failed to encode query", engine->name);
+      curl_easy_cleanup(curl);
+      free(chunk.memory);
+      return -1;
+    }
+    snprintf(full_url, sizeof(full_url), "%s%s", engine->base_url,
+             encoded_query);
+    curl_free(encoded_query);
+
+    LOG_DEBUG("[%s] Requesting URL: %s", engine->name, full_url);
+
+    struct curl_slist *headers = NULL;
+    char host_buf[256], ref_buf[256];
+    snprintf(host_buf, sizeof(host_buf), "Host: %s", engine->host_header);
+    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", engine->referer);
+
+    headers = curl_slist_append(headers, host_buf);
+    headers = curl_slist_append(headers, ref_buf);
+    headers = curl_slist_append(headers,
+                                "Accept: "
+                                "text/html,application/xhtml+xml,application/"
+                                "xml;q=0.9,image/avif,image/webp,*/*;q=0.8");
+    headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
+    headers = curl_slist_append(headers, "DNT: 1");
+    headers = curl_slist_append(headers, "Upgrade-Insecure-Requests: 1");
+    headers = curl_slist_append(headers, "Sec-Fetch-Dest: document");
+    headers = curl_slist_append(headers, "Sec-Fetch-Mode: navigate");
+    headers = curl_slist_append(headers, "Sec-Fetch-Site: same-origin");
+    headers = curl_slist_append(headers, "Connection: keep-alive");
+
+    curl_easy_setopt(curl, CURLOPT_URL, full_url);
+    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
+    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
+    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)&chunk);
+
+    const char *ua = get_random_user_agent();
+    LOG_DEBUG("[%s] Using User-Agent: %s", engine->name, ua);
+    curl_easy_setopt(curl, CURLOPT_USERAGENT, ua);
+
+    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
+    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
+
+    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
+
+    LOG_DEBUG("[%s] Waiting for rate-limit jitter...", engine->name);
+    usleep(500000 + (rand() % 1000000));
+
+    CURLcode res = curl_easy_perform(curl);
+
+    if (res != CURLE_OK) {
+      LOG_ERROR("[%s] libcurl error: %s", engine->name,
+                curl_easy_strerror(res));
+    } else {
+      long response_code;
+      curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
+      LOG_INFO("[%s] HTTP Response Code: %ld", engine->name, response_code);
+
+      if (chunk.size > 0) {
+        xmlDocPtr doc = htmlReadMemory(
+            chunk.memory, chunk.size, NULL, NULL,
+            HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
+        if (doc) {
+          results_count =
+              engine->parser(engine->name, doc, out_results, max_results);
+          xmlFreeDoc(doc);
+        }
+      }
+    }
+
+    if (results_count <= 0) {
+      LOG_WARN("[%s] No results found. Generating skeleton fallback.",
+               engine->name);
+      *out_results = (SearchResult *)malloc(sizeof(SearchResult));
+      if (*out_results) {
+        char fallback_msg[512];
+        snprintf(fallback_msg, sizeof(fallback_msg),
+                 "Search %s manually for '%s'", engine->name, query);
+
+        (*out_results)[0].title = strdup(fallback_msg);
+        (*out_results)[0].url = strdup(full_url);
+        (*out_results)[0].snippet = strdup(
+            "Automated results were blocked by a Captcha or anti-bot "
+            "challenge. Click the link above to perform the search "
+            "manually in your browser.");
+        results_count = 1;
+      }
+    }
+
+    curl_slist_free_all(headers);
+    curl_easy_cleanup(curl);
+  } else {
+    if (curl) {
+      curl_easy_cleanup(curl);
+    }
+  }
+
+  free(chunk.memory);
+
+  return results_count;
+}
diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h
new file mode 100644
index 0000000..7ad4d59
--- /dev/null
+++ b/src/Scraping/Scraping.h
@@ -0,0 +1,34 @@
+#ifndef SCRAPING_H
+#define SCRAPING_H
+
+#include <libxml/HTMLparser.h>
+
+#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
+#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
+#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
+#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)
+
+typedef struct {
+  char *url;
+  char *title;
+  char *snippet;
+} SearchResult;
+
+typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
+                          SearchResult **out_results, int max_results);
+
+typedef struct {
+  const char *name;
+  const char *base_url;
+  const char *host_header;
+  const char *referer;
+  ParserFunc parser;
+} SearchEngine;
+
+extern const SearchEngine ENGINE_REGISTRY[];
+extern const int ENGINE_COUNT;
+
+int scrape_engine(const SearchEngine *engine, const char *query,
+                  SearchResult **out_results, int max_results);
+
+#endif
diff --git a/src/Utility/Display.c b/src/Utility/Display.c
new file mode 100644
index 0000000..492e998
--- /dev/null
+++ b/src/Utility/Display.c
@@ -0,0 +1,46 @@
+#include "Display.h"
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+
+char *pretty_display_url(const char *input) {
+  if (!input) return NULL;
+
+  const char *start = input;
+
+  const char *protocol_pos = strstr(input, "://");
+  if (protocol_pos) {
+    start = protocol_pos + 3;
+  }
+
+  if (strncasecmp(start, "www.", 4) == 0) {
+    start += 4;
+  }
+
+  size_t input_len = strlen(start);
+  char temp[512];
+  strncpy(temp, start, sizeof(temp) - 1);
+  temp[sizeof(temp) - 1] = '\0';
+
+  if (input_len > 0 && temp[input_len - 1] == '/') {
+    temp[input_len - 1] = '\0';
+  }
+
+  char *output = (char *)malloc(strlen(temp) * 3 + 1);
+  if (!output) return NULL;
+
+  size_t j = 0;
+  for (size_t i = 0; temp[i] != '\0'; i++) {
+    if (temp[i] == '/') {
+      output[j++] = ' ';
+      output[j++] = '>';
+      output[j++] = ' ';
+    } else {
+      output[j++] = (char)tolower((unsigned char)temp[i]);
+    }
+  }
+  output[j] = '\0';
+
+  return output;
+}
diff --git a/src/Utility/Display.h b/src/Utility/Display.h
new file mode 100644
index 0000000..bbaf421
--- /dev/null
+++ b/src/Utility/Display.h
@@ -0,0 +1,6 @@
+#ifndef DISPLAY_H
+#define DISPLAY_H
+
+char *pretty_display_url(const char *input);
+
+#endif
diff --git a/src/Utility/Unescape.c b/src/Utility/Unescape.c
new file mode 100644
index 0000000..e2968b2
--- /dev/null
+++ b/src/Utility/Unescape.c
@@ -0,0 +1,80 @@
+#include "Unescape.h"
+#include "Utility.h"
+#include <stdlib.h>
+#include <string.h>
+
+char *unescape_search_url(const char *input) {
+  if (!input) return NULL;
+
+  const char *key = NULL;
+  const char *start = NULL;
+  const char *end = NULL;
+  size_t len = 0;
+
+  if (strstr(input, "uddg=")) {
+    key = "uddg=";
+    start = strstr(input, key);
+    if (!start) return NULL;
+    start += strlen(key);
+    end = strchr(start, '&');
+    len = end ? (size_t)(end - start) : strlen(start);
+  }
+
+  else if (strstr(input, "RU=")) {
+    key = "RU=";
+    start = strstr(input, key);
+    if (!start) return strdup(input);
+    start += strlen(key);
+    end = strchr(start, '/');
+    len = end ? (size_t)(end - start) : strlen(start);
+  }
+
+  else {
+    return strdup(input);
+  }
+
+  char *output = (char *)malloc(len * 3 + 1);
+  if (!output) return NULL;
+
+  size_t i = 0, j = 0;
+  while (i < len) {
+    if (start[i] == '%' && i + 2 < len) {
+      int high = hex_to_int(start[i + 1]);
+      int low = hex_to_int(start[i + 2]);
+      if (high != -1 && low != -1) {
+        output[j++] = (char)((high << 4) | low);
+        i += 3;
+      } else {
+        output[j++] = start[i++];
+      }
+    } else if (start[i] == '+') {
+      output[j++] = ' ';
+      i++;
+    } else {
+      output[j++] = start[i++];
+    }
+  }
+  output[j] = '\0';
+
+  return output;
+}
+
+char *url_decode_query(const char *src) {
+  if (!src) return NULL;
+  char *res = strdup(src);
+  char *p = res;
+  while (*src) {
+    if (*src == '+') {
+      *p++ = ' ';
+    } else if (*src == '%' && src[1] && src[2]) {
+      char hex[3] = {src[1], src[2], '\0'};
+      *p++ = (char)strtol(hex, NULL, 16);
+      src += 2;
+    } else {
+      *p++ = *src;
+    }
+    src++;
+  }
+  *p = '\0';
+  return res;
+}
diff --git a/src/Utility/Unescape.h b/src/Utility/Unescape.h
new file mode 100644
index 0000000..0adb228
--- /dev/null
+++ b/src/Utility/Unescape.h
@@ -0,0 +1,10 @@
+#ifndef UNESCAPE_H
+#define UNESCAPE_H
+
+#include <stddef.h>
+
+char *unescape_search_url(const char *input);
+char *url_decode_query(const char *src);
+
+#endif
+
diff --git a/src/Utility/Utility.c b/src/Utility/Utility.c
new file mode 100644
index 0000000..8e5af92
--- /dev/null
+++ b/src/Utility/Utility.c
@@ -0,0 +1,8 @@
+#include "Utility.h"
+
+int hex_to_int(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return -1;
+}
diff --git a/src/Utility/Utility.h b/src/Utility/Utility.h
new file mode 100644
index 0000000..3b0181c
--- /dev/null
+++ b/src/Utility/Utility.h
@@ -0,0 +1,6 @@
+#ifndef UTILITY_H
+#define UTILITY_H
+
+int hex_to_int(char c);
+
+#endif
