diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Config.c | 67 | ||||
| -rw-r--r-- | src/Config.h | 11 | ||||
| -rw-r--r-- | src/Infobox/Calculator.c | 115 | ||||
| -rw-r--r-- | src/Infobox/Calculator.h | 9 | ||||
| -rw-r--r-- | src/Infobox/Dictionary.c | 246 | ||||
| -rw-r--r-- | src/Infobox/Dictionary.h | 10 | ||||
| -rw-r--r-- | src/Infobox/Infobox.c | 13 | ||||
| -rw-r--r-- | src/Infobox/Infobox.h | 13 | ||||
| -rw-r--r-- | src/Infobox/Wikipedia.c | 165 | ||||
| -rw-r--r-- | src/Infobox/Wikipedia.h | 9 | ||||
| -rw-r--r-- | src/Main.c | 49 | ||||
| -rw-r--r-- | src/Routes/Home.c | 14 | ||||
| -rw-r--r-- | src/Routes/Home.h | 8 | ||||
| -rw-r--r-- | src/Routes/Images.c | 278 | ||||
| -rw-r--r-- | src/Routes/Images.h | 8 | ||||
| -rw-r--r-- | src/Routes/Search.c | 275 | ||||
| -rw-r--r-- | src/Routes/Search.h | 8 | ||||
| -rw-r--r-- | src/Scraping/Scraping.c | 459 | ||||
| -rw-r--r-- | src/Scraping/Scraping.h | 58 | ||||
| -rw-r--r-- | src/Utility/Display.c | 46 | ||||
| -rw-r--r-- | src/Utility/Display.h | 6 | ||||
| -rw-r--r-- | src/Utility/Unescape.c | 80 | ||||
| -rw-r--r-- | src/Utility/Unescape.h | 10 | ||||
| -rw-r--r-- | src/Utility/Utility.c | 8 | ||||
| -rw-r--r-- | src/Utility/Utility.h | 6 |
25 files changed, 1971 insertions, 0 deletions
diff --git a/src/Config.c b/src/Config.c new file mode 100644 index 0000000..4a93980 --- /dev/null +++ b/src/Config.c @@ -0,0 +1,67 @@ +#include "Config.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int load_config(const char *filename, Config *config) { + FILE *file = fopen(filename, "r"); + if (!file) { + return -1; + } + + char line[512]; + char section[64] = ""; + + while (fgets(line, sizeof(line), file)) { + + line[strcspn(line, "\r\n")] = 0; + + if (line[0] == '\0' || line[0] == '#' || line[0] == ';') { + continue; + } + + if (line[0] == '[') { + char *end = strchr(line, ']'); + if (end) { + *end = '\0'; + snprintf(section, sizeof(section), "%.*s", (int)(sizeof(section) - 1), line + 1); + section[sizeof(section) - 1] = '\0'; + } + continue; + } + + char *delimiter = strchr(line, '='); + if (delimiter) { + *delimiter = '\0'; + char *key = line; + char *value = delimiter + 1; + + while (*key == ' ' || *key == '\t') key++; + while (*value == ' ' || *value == '\t') value++; + + char *key_end = key + strlen(key) - 1; + while (key_end > key && (*key_end == ' ' || *key_end == '\t')) { + *key_end = '\0'; + key_end--; + } + + char *value_end = value + strlen(value) - 1; + while (value_end > value && (*value_end == ' ' || *value_end == '\t')) { + *value_end = '\0'; + value_end--; + } + + if (strcmp(section, "server") == 0) { + if (strcmp(key, "host") == 0) { + strncpy(config->host, value, sizeof(config->host) - 1); + config->host[sizeof(config->host) - 1] = '\0'; + } else if (strcmp(key, "port") == 0) { + config->port = atoi(value); + } + } + } + } + + fclose(file); + return 0; +}
/* ==== src/Config.h ==== */
#ifndef CONFIG_H
#define CONFIG_H

/* Runtime settings read from config.ini by load_config(). */
typedef struct {
    char host[256]; /* listen address, e.g. "0.0.0.0" */
    int port;       /* TCP port */
} Config;

/* Fills `config` from an INI file.
 * Returns 0 on success, -1 if the file cannot be opened. */
int load_config(const char *filename, Config *config);

#endif /* CONFIG_H */
\ No newline at end of file diff --git a/src/Infobox/Calculator.c b/src/Infobox/Calculator.c new file mode 100644 index 0000000..b80ce21 --- /dev/null +++ b/src/Infobox/Calculator.c @@ -0,0 +1,115 @@ +#include "Calculator.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <ctype.h> + +static char logic_log[4096]; + +typedef struct { + const char *buffer; + int pos; +} Parser; + +static double parse_expression(Parser *p); + +static void skip_ws(Parser *p) { + while (p->buffer[p->pos] == ' ') p->pos++; +} + +static double parse_factor(Parser *p) { + skip_ws(p); + if (p->buffer[p->pos] == '-') { + p->pos++; + return -parse_factor(p); + } + if (p->buffer[p->pos] == '(') { + p->pos++; + double res = parse_expression(p); + if (p->buffer[p->pos] == ')') p->pos++; + return res; + } + char *endptr; + double val = strtod(&p->buffer[p->pos], &endptr); + p->pos = (int)(endptr - p->buffer); + return val; +} + +static double parse_term(Parser *p) { + double left = parse_factor(p); + while (1) { + skip_ws(p); + char op = p->buffer[p->pos]; + if (op == '*' || op == '/') { + p->pos++; + double right = parse_factor(p); + double old = left; + left = (op == '*') ? left * right : left / right; + + char step[256]; + + snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, + right, left); + strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); + } else + break; + } + return left; +} + +static double parse_expression(Parser *p) { + double left = parse_term(p); + while (1) { + skip_ws(p); + char op = p->buffer[p->pos]; + if (op == '+' || op == '-') { + p->pos++; + double right = parse_term(p); + double old = left; + left = (op == '+') ? 
left + right : left - right; + + char step[256]; + + snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op, + right, left); + strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1); + } else + break; + } + return left; +} + +double evaluate(const char *expr) { + logic_log[0] = '\0'; + if (!expr || strlen(expr) == 0) return 0.0; + Parser p = {expr, 0}; + return parse_expression(&p); +} + +InfoBox fetch_calc_data(char *math_input) { + InfoBox info = {NULL, NULL, NULL, NULL}; + if (!math_input) return info; + + double result = evaluate(math_input); + + char html_output[5120]; + snprintf(html_output, sizeof(html_output), + "<div class='calc-container' style='line-height: 1.6;'>" + "%s" + "<div style='margin-top: 8px; border-top: 1px solid #eee; " + "padding-top: 8px; font-size: 1.2em;'>" + "<b>%g</b>" + "</div>" + "</div>", + strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>", + result); + + info.title = strdup("Calculation"); + info.extract = strdup(html_output); + info.thumbnail_url = + strdup("/static/calculation.svg"); + info.url = strdup("#"); + + return info; +} diff --git a/src/Infobox/Calculator.h b/src/Infobox/Calculator.h new file mode 100644 index 0000000..275aed6 --- /dev/null +++ b/src/Infobox/Calculator.h @@ -0,0 +1,9 @@ +#ifndef CALCULATOR_H +#define CALCULATOR_H + +#include "Infobox.h" + +double evaluate(const char *expr); +InfoBox fetch_calc_data(char *math_input); + +#endif diff --git a/src/Infobox/Dictionary.c b/src/Infobox/Dictionary.c new file mode 100644 index 0000000..a835899 --- /dev/null +++ b/src/Infobox/Dictionary.c @@ -0,0 +1,246 @@ +#include "Dictionary.h" +#include <curl/curl.h> +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <ctype.h> + +static const char *PREFIXES[] = { + "what is the definition of ", "what's the definition of ", + "what is the meaning of ", "what's the meaning of ", + "what 
does the word ", "definition of ", "meaning of ", "def of ", + "define ", "definition ", "define:", "def ", "def:", + "what does ", "what is ", "what's ", "whats ", + "meaning ", "dictionary ", "dict ", NULL +}; + +static const char *SUFFIXES[] = { + " definition", " def", " meaning", " mean", " means", + " dictionary", " dict", " define", " defined", + " definition?", " def?", " meaning?", " mean?", " means?", + " in english", " in english?", NULL +}; + +static const char *SKIP_WORDS[] = {"of ", "the ", "a ", "an ", NULL}; + +static const char *strcasestr_impl(const char *haystack, const char *needle) { + if (!haystack || !needle || !*needle) return haystack; + size_t len = strlen(needle); + for (const char *h = haystack; *h; h++) { + if (strncasecmp(h, needle, len) == 0) return h; + } + return NULL; +} + +struct MemStruct { char *memory; size_t size; }; + +static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t realsize = size * nmemb; + struct MemStruct *mem = (struct MemStruct *)userp; + char *ptr = realloc(mem->memory, mem->size + realsize + 1); + if (!ptr) return 0; + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + return realsize; +} + +static char *xpath_text(xmlDocPtr doc, const char *xpath) { + xmlXPathContextPtr ctx = xmlXPathNewContext(doc); + if (!ctx) return NULL; + xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx); + xmlXPathFreeContext(ctx); + if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) { + if (obj) xmlXPathFreeObject(obj); + return NULL; + } + xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]); + char *result = content ? 
strdup((char *)content) : NULL; + if (content) xmlFree(content); + xmlXPathFreeObject(obj); + return result; +} + +static char *build_html(const char *word, const char *pron, const char *pos, + const char *def, const char *ex) { + char html[4096]; + int n = snprintf(html, sizeof(html), "<div class='dict-container' style='line-height: 1.6;'>"); + if (word) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-size: 1.3em; font-weight: bold; margin-bottom: 4px;'>%s</div>", word); + if (pron) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #666; margin-bottom: 8px;'>/%s/</div>", pron); + if (pos) n += snprintf(html + n, sizeof(html) - n, + "<div style='font-style: italic; color: #888; margin-bottom: 8px;'>%s</div>", pos); + if (def) n += snprintf(html + n, sizeof(html) - n, + "<div style='margin-bottom: 8px;'>%s</div>", def); + if (ex) n += snprintf(html + n, sizeof(html) - n, + "<div style='color: #555; font-style: italic; margin-top: 8px;'>\"%s\"</div>", ex); + snprintf(html + n, sizeof(html) - n, "</div>"); + return strdup(html); +} + +static char *extract_word(const char *query) { + if (!query) return NULL; + + const char *start = query; + + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(start, PREFIXES[i], len) == 0) { + start += len; + break; + } + } + + while (*start == ' ') start++; + char *word = strdup(start); + if (!word) return NULL; + + int changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SKIP_WORDS[i]; i++) { + size_t len = strlen(SKIP_WORDS[i]); + if (strncasecmp(word, SKIP_WORDS[i], len) == 0) { + memmove(word, word + len, strlen(word + len) + 1); + changed = 1; + break; + } + } + } + + changed = 1; + while (changed) { + changed = 0; + for (int i = 0; SUFFIXES[i]; i++) { + const char *found = strcasestr_impl(word, SUFFIXES[i]); + if (found) { + char *pos = word + (found - word); + *pos = '\0'; + changed = 1; + break; + } + } + } + + size_t len = strlen(word); + while 
(len > 0 && (word[len-1] == ' ' || word[len-1] == '?' || + word[len-1] == '!' || word[len-1] == '.')) { + word[--len] = '\0'; + } + + if (len == 0) { free(word); return NULL; } + + for (size_t i = 0; i < len; i++) word[i] = tolower((unsigned char)word[i]); + char *space = strchr(word, ' '); + if (space) *space = '\0'; + + return word; +} + +int is_dictionary_query(const char *query) { + if (!query) return 0; + + for (int i = 0; PREFIXES[i]; i++) { + size_t len = strlen(PREFIXES[i]); + if (strncasecmp(query, PREFIXES[i], len) == 0) { + const char *after = query + len; + while (*after == ' ') after++; + if (*after != '\0') return 1; + } + } + + for (int i = 0; SUFFIXES[i]; i++) { + const char *pos = strcasestr_impl(query, SUFFIXES[i]); + if (pos) { + const char *after = pos + strlen(SUFFIXES[i]); + while (*after == ' ' || *after == '?' || *after == '!' || *after == '.') after++; + if (*after == '\0' && pos > query && (pos - query) < 100) return 1; + } + } + + if (strncasecmp(query, "what is ", 8) == 0 || + strncasecmp(query, "what's ", 7) == 0 || + strncasecmp(query, "whats ", 6) == 0) { + const char *word = query + (strncasecmp(query, "what is ", 8) == 0 ? 8 : + strncasecmp(query, "what's ", 7) == 0 ? 
7 : 6); + const char *articles[] = {"the ", "your ", "my ", "his ", "her ", "their ", + "our ", "this ", "that ", "these ", "those ", "a ", "an ", NULL}; + for (int i = 0; articles[i]; i++) { + if (strncasecmp(word, articles[i], strlen(articles[i])) == 0) return 0; + } + const char *space = strchr(word, ' '); + if (!space || *(space + 1) == '\0' || *(space + 1) == '?') return 1; + } + + return 0; +} + +char *construct_dictionary_url(const char *query) { + char *word = extract_word(query); + if (!word) return NULL; + + CURL *curl = curl_easy_init(); + if (!curl) { free(word); return NULL; } + + char *escaped = curl_easy_escape(curl, word, 0); + const char *base = "https://dictionary.cambridge.org/dictionary/english/"; + char *url = malloc(strlen(base) + strlen(escaped) + 1); + if (url) { + strcpy(url, base); + strcat(url, escaped); + } + + curl_free(escaped); + curl_easy_cleanup(curl); + free(word); + return url; +} + +InfoBox fetch_dictionary_data(const char *query) { + InfoBox info = {NULL, NULL, NULL, NULL}; + + char *url = construct_dictionary_url(query); + if (!url) return info; + + CURL *curl = curl_easy_init(); + if (!curl) { free(url); return info; } + + struct MemStruct chunk = {malloc(1), 0}; + curl_easy_setopt(curl, CURLOPT_URL, url); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + + if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) { + htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + if (doc) { + char *word = xpath_text(doc, "//span[@class='hw dhw']"); + char *pron = xpath_text(doc, "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']"); + char *pos = xpath_text(doc, "//span[@class='pos dpos']"); + char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]"); 
+ char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]"); + + if (word && def) { + info.title = strdup("Dictionary"); + info.extract = build_html(word, pron, pos, def, ex); + info.thumbnail_url = strdup("/static/dictionary.jpg"); + info.url = strdup(url); + } + + free(word); free(pron); free(pos); free(def); free(ex); + xmlFreeDoc(doc); + } + } + + curl_easy_cleanup(curl); + free(chunk.memory); + free(url); + return info; +}
\ No newline at end of file diff --git a/src/Infobox/Dictionary.h b/src/Infobox/Dictionary.h new file mode 100644 index 0000000..2f212c3 --- /dev/null +++ b/src/Infobox/Dictionary.h @@ -0,0 +1,10 @@ +#ifndef DICTIONARY_H +#define DICTIONARY_H + +#include "Infobox.h" + +InfoBox fetch_dictionary_data(const char *word); +char *construct_dictionary_url(const char *word); +int is_dictionary_query(const char *query); + +#endif
\ No newline at end of file diff --git a/src/Infobox/Infobox.c b/src/Infobox/Infobox.c new file mode 100644 index 0000000..5043c05 --- /dev/null +++ b/src/Infobox/Infobox.c @@ -0,0 +1,13 @@ +#include "Infobox.h" +#include <stdlib.h> + +void free_infobox(InfoBox *info) { + if (info->title) + free(info->title); + if (info->thumbnail_url) + free(info->thumbnail_url); + if (info->extract) + free(info->extract); + if (info->url) + free(info->url); +} diff --git a/src/Infobox/Infobox.h b/src/Infobox/Infobox.h new file mode 100644 index 0000000..a052b80 --- /dev/null +++ b/src/Infobox/Infobox.h @@ -0,0 +1,13 @@ +#ifndef INFOBOX_H +#define INFOBOX_H + +typedef struct { + char *title; + char *thumbnail_url; + char *extract; + char *url; +} InfoBox; + +void free_infobox(InfoBox *info); + +#endif diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c new file mode 100644 index 0000000..ed4645f --- /dev/null +++ b/src/Infobox/Wikipedia.c @@ -0,0 +1,165 @@ +#include "Wikipedia.h" +#include <curl/curl.h> +#include <libxml/parser.h> +#include <libxml/tree.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +struct WikiMemoryStruct { + char *memory; + size_t size; +}; + +static void shorten_summary(char **extract_ptr, int max_chars) { + if (!extract_ptr || !*extract_ptr) return; + + char *text = *extract_ptr; + int len = strlen(text); + + if (len <= max_chars) return; + + int end_pos = max_chars; + for (int i = max_chars; i > (max_chars / 2); i--) { + if (text[i] == '.' || text[i] == '!' 
|| text[i] == '?') { + end_pos = i + 1; + break; + } + } + + char *new_text = (char *)malloc(end_pos + 4); + + if (new_text) { + strncpy(new_text, text, end_pos); + new_text[end_pos] = '\0'; + strcat(new_text, "..."); + free(*extract_ptr); + *extract_ptr = new_text; + } +} + +static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp; + + char *ptr = realloc(mem->memory, mem->size + realsize + 1); + if (ptr == NULL) { + fprintf(stderr, "Not enough memory (realloc returned NULL)\n"); + return 0; + } + + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + + return realsize; +} + +static void extract_wiki_info(xmlNode *node, InfoBox *info) { + xmlNode *cur_node = NULL; + + for (cur_node = node; cur_node; cur_node = cur_node->next) { + if (cur_node->type == XML_ELEMENT_NODE) { + if (strcmp((const char *)cur_node->name, "page") == 0) { + xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title"); + if (title) { + info->title = strdup((const char *)title); + + const char *base_article_url = "https://en.wikipedia.org/wiki/"; + char *formatted_title = strdup((const char *)title); + for (int i = 0; formatted_title[i]; i++) { + if (formatted_title[i] == ' ') formatted_title[i] = '_'; + } + + info->url = + malloc(strlen(base_article_url) + strlen(formatted_title) + 1); + if (info->url) { + strcpy(info->url, base_article_url); + strcat(info->url, formatted_title); + } + free(formatted_title); + xmlFree(title); + } + } + + if (strcmp((const char *)cur_node->name, "thumbnail") == 0) { + xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source"); + if (source) { + info->thumbnail_url = strdup((const char *)source); + xmlFree(source); + } + } + + if (strcmp((const char *)cur_node->name, "extract") == 0) { + xmlChar *content = 
xmlNodeGetContent(cur_node); + if (content) { + info->extract = strdup((const char *)content); + + shorten_summary(&(info->extract), 300); + xmlFree(content); + } + } + } + extract_wiki_info(cur_node->children, info); + } +} + +InfoBox fetch_wiki_data(char *api_url) { + CURL *curl_handle; + CURLcode res; + struct WikiMemoryStruct chunk; + InfoBox info = {NULL, NULL, NULL, NULL}; + + chunk.memory = malloc(1); + chunk.size = 0; + + curl_handle = curl_easy_init(); + + if (curl_handle) { + curl_easy_setopt(curl_handle, CURLOPT_URL, api_url); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, + WikiWriteMemoryCallback); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); + curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + + res = curl_easy_perform(curl_handle); + + if (res == CURLE_OK) { + xmlDocPtr doc = + xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); + if (doc != NULL) { + xmlNode *root_element = xmlDocGetRootElement(doc); + extract_wiki_info(root_element, &info); + xmlFreeDoc(doc); + } + } + + curl_easy_cleanup(curl_handle); + free(chunk.memory); + } + + return info; +} + +char *construct_wiki_url(const char *search_term) { + CURL *curl = curl_easy_init(); + if (!curl) return NULL; + + char *escaped_term = curl_easy_escape(curl, search_term, 0); + const char *base = + "https://en.wikipedia.org/w/" + "api.php?action=query&prop=extracts|pageimages&exintro&" + "explaintext&pithumbsize=400&format=xml&origin=*&titles="; + + char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1); + if (full_url) { + strcpy(full_url, base); + strcat(full_url, escaped_term); + } + + curl_free(escaped_term); + curl_easy_cleanup(curl); + return full_url; +} diff --git a/src/Infobox/Wikipedia.h b/src/Infobox/Wikipedia.h new file mode 100644 index 0000000..8a8103e --- /dev/null +++ b/src/Infobox/Wikipedia.h @@ -0,0 +1,9 @@ +#ifndef WIKIPEDIA_H +#define WIKIPEDIA_H + +#include "Infobox.h" + +InfoBox fetch_wiki_data(char 
*api_url); +char *construct_wiki_url(const char *search_term); + +#endif diff --git a/src/Main.c b/src/Main.c new file mode 100644 index 0000000..d1b2eb9 --- /dev/null +++ b/src/Main.c @@ -0,0 +1,49 @@ +#include <beaker.h> +#include <curl/curl.h> +#include <libxml/parser.h> +#include <stdio.h> +#include <stdlib.h> + +#include "Config.h" +#include "Routes/Home.h" +#include "Routes/Images.h" +#include "Routes/Search.h" + +int handle_opensearch(UrlParams *params) { + (void)params; + serve_static_file_with_mime("opensearch.xml", "application/opensearchdescription+xml"); + return 0; +} + +int main() { + LIBXML_TEST_VERSION + xmlInitParser(); + + curl_global_init(CURL_GLOBAL_DEFAULT); + + Config config = {.host = "0.0.0.0", .port = 5000}; + + if (load_config("config.ini", &config) != 0) { + fprintf(stderr, "Warning: Could not load config file, using defaults\n"); + } + + set_handler("/", home_handler); + set_handler("/opensearch.xml", handle_opensearch); + set_handler("/search", results_handler); + set_handler("/images", images_handler); + + fprintf(stderr, "Starting Omnisearch on %s:%d\n", config.host, config.port); + + int result = beaker_run(config.host, config.port); + + if (result != 0) { + fprintf(stderr, "Error: Beaker server failed to start.\n"); + curl_global_cleanup(); + xmlCleanupParser(); + return EXIT_FAILURE; + } + + curl_global_cleanup(); + xmlCleanupParser(); + return EXIT_SUCCESS; +}
\ No newline at end of file diff --git a/src/Routes/Home.c b/src/Routes/Home.c new file mode 100644 index 0000000..81370ba --- /dev/null +++ b/src/Routes/Home.c @@ -0,0 +1,14 @@ +#include "Home.h" +#include <stdlib.h> + +int home_handler(UrlParams *params) { + (void)params; + TemplateContext ctx = new_context(); + char *rendered_html = render_template("home.html", &ctx); + send_response(rendered_html); + + free(rendered_html); + free_context(&ctx); + + return 0; +} diff --git a/src/Routes/Home.h b/src/Routes/Home.h new file mode 100644 index 0000000..5d01ab3 --- /dev/null +++ b/src/Routes/Home.h @@ -0,0 +1,8 @@ +#ifndef HOME_H +#define HOME_H + +#include <beaker.h> + +int home_handler(UrlParams *params); + +#endif diff --git a/src/Routes/Images.c b/src/Routes/Images.c new file mode 100644 index 0000000..67ae94c --- /dev/null +++ b/src/Routes/Images.c @@ -0,0 +1,278 @@ +#include "Images.h" +#include "../Utility/Unescape.h" + +#include <curl/curl.h> +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +struct MemoryBlock { + char *response; + size_t size; +}; + +static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + struct MemoryBlock *mem = (struct MemoryBlock *)userp; + char *ptr = (char *)realloc(mem->response, mem->size + realsize + 1); + if (ptr == NULL) { + return 0; + } + mem->response = ptr; + memcpy(&(mem->response[mem->size]), data, realsize); + mem->size += realsize; + mem->response[mem->size] = 0; + return realsize; +} + +static char *fetch_images_html(const char *url) { + CURL *curl_handle; + struct MemoryBlock chunk = {.response = malloc(1), .size = 0}; + if (!chunk.response) { + return NULL; + } + + curl_handle = curl_easy_init(); + if (!curl_handle) { + free(chunk.response); + return NULL; + } + + curl_easy_setopt(curl_handle, CURLOPT_URL, url); + curl_easy_setopt(curl_handle, 
CURLOPT_WRITEFUNCTION, ImageWriteCallback); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); + curl_easy_setopt( + curl_handle, CURLOPT_USERAGENT, + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko"); + curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L); + + CURLcode res = curl_easy_perform(curl_handle); + if (res != CURLE_OK) { + free(chunk.response); + curl_easy_cleanup(curl_handle); + return NULL; + } + + curl_easy_cleanup(curl_handle); + return chunk.response; +} + +int images_handler(UrlParams *params) { + TemplateContext ctx = new_context(); + char *raw_query = ""; + + if (params) { + for (int i = 0; i < params->count; i++) { + if (strcmp(params->params[i].key, "q") == 0) { + raw_query = params->params[i].value; + break; + } + } + } + + char *display_query = url_decode_query(raw_query); + context_set(&ctx, "query", display_query); + + if (!raw_query || strlen(raw_query) == 0) { + send_response("<h1>No query provided</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; + } + + CURL *tmp = curl_easy_init(); + if (!tmp) { + send_response("<h1>Error initializing curl</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; + } + char *encoded_query = curl_easy_escape(tmp, raw_query, 0); + curl_easy_cleanup(tmp); + + if (!encoded_query) { + send_response("<h1>Error encoding query</h1>"); + if (display_query) free(display_query); + free_context(&ctx); + return -1; + } + + char url[1024]; + snprintf(url, sizeof(url), + "https://www.bing.com/images/search?q=%s", encoded_query); + fprintf(stderr, "[DEBUG] Fetching URL: %s\n", url); + + char *html = fetch_images_html(url); + if (!html) { + fprintf(stderr, "[DEBUG] Failed to fetch HTML\n"); + send_response("<h1>Error fetching images</h1>"); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; + } + + htmlDocPtr doc = 
htmlReadMemory(html, (int)strlen(html), NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR); + if (!doc) { + free(html); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; + } + + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + + if (!xpathCtx) { + xmlFreeDoc(doc); + free(html); + free(encoded_query); + free(display_query); + free_context(&ctx); + return -1; + } + + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx); + + int image_count = 0; + char ***image_matrix = NULL; + int *inner_counts = NULL; + + if (xpathObj && xpathObj->nodesetval) { + int nodes = xpathObj->nodesetval->nodeNr; + fprintf(stderr, "[DEBUG] Found %d image items\n", nodes); + + int max_images = (nodes < 32) ? nodes : 32; + image_matrix = malloc(sizeof(char **) * max_images); + inner_counts = malloc(sizeof(int) * max_images); + + for (int i = 0; i < nodes; i++) { + if (image_count >= 32) break; + + xmlNodePtr node = xpathObj->nodesetval->nodeTab[i]; + xmlNodePtr img_node = NULL; + xmlNodePtr tit_node = NULL; + xmlNodePtr des_node = NULL; + xmlNodePtr thumb_link = NULL; + + for (xmlNodePtr child = node->children; child; child = child->next) { + if (child->type != XML_ELEMENT_NODE) continue; + + if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class) { + if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) { + thumb_link = child; + for (xmlNodePtr thumb_child = child->children; thumb_child; thumb_child = thumb_child->next) { + if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = xmlGetProp(thumb_child, (const xmlChar *)"class"); + if (div_class && xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) { + for (xmlNodePtr cico_child = thumb_child->children; cico_child; cico_child = cico_child->next) { + if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == 0) { + img_node = cico_child; + 
break; + } + } + } + if (div_class) xmlFree(div_class); + } + } + } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) { + tit_node = child; + } + xmlFree(class); + } + } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) { + xmlChar *class = xmlGetProp(child, (const xmlChar *)"class"); + if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) { + for (xmlNodePtr meta_child = child->children; meta_child; meta_child = meta_child->next) { + if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) { + xmlChar *div_class = xmlGetProp(meta_child, (const xmlChar *)"class"); + if (div_class) { + if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) { + des_node = meta_child; + } + xmlFree(div_class); + } + } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) { + xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class"); + if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) { + tit_node = meta_child; + } + if (a_class) xmlFree(a_class); + } + } + } + if (class) xmlFree(class); + } + } + + xmlChar *iurl = img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL; + xmlChar *full_url = thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL; + xmlChar *title = des_node ? xmlNodeGetContent(des_node) : (tit_node ? xmlNodeGetContent(tit_node) : NULL); + xmlChar *rurl = tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL; + + fprintf(stderr, "[DEBUG] Image %d: thumb=%s, full=%s, title=%s, site=%s\n", + image_count, iurl ? (char *)iurl : "nil", + full_url ? (char *)full_url : "nil", + title ? (char *)title : "nil", + rurl ? (char *)rurl : "nil"); + + if (iurl && strlen((char *)iurl) > 0) { + image_matrix[image_count] = malloc(sizeof(char *) * 4); + image_matrix[image_count][0] = strdup((char *)iurl); + image_matrix[image_count][1] = strdup(title ? (char *)title : "Image"); + image_matrix[image_count][2] = strdup(rurl ? 
(char *)rurl : "#"); + image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#"); + inner_counts[image_count] = 4; + image_count++; + } + + if (iurl) xmlFree(iurl); + if (title) xmlFree(title); + if (rurl) xmlFree(rurl); + if (full_url) xmlFree(full_url); + } + } + + context_set_array_of_arrays(&ctx, "images", image_matrix, image_count, + inner_counts); + + char *rendered = render_template("images.html", &ctx); + if (rendered) { + send_response(rendered); + free(rendered); + } else { + send_response("<h1>Error rendering image results</h1>"); + } + + if (image_matrix) { + for (int i = 0; i < image_count; i++) { + for (int j = 0; j < 4; j++) { + free(image_matrix[i][j]); + } + free(image_matrix[i]); + } + free(image_matrix); + } + if (inner_counts) { + free(inner_counts); + } + + if (xpathObj) xmlXPathFreeObject(xpathObj); + if (xpathCtx) xmlXPathFreeContext(xpathCtx); + if (doc) xmlFreeDoc(doc); + free(html); + curl_free(encoded_query); + free(display_query); + free_context(&ctx); + + return 0; +}
\ No newline at end of file diff --git a/src/Routes/Images.h b/src/Routes/Images.h new file mode 100644 index 0000000..86f4a31 --- /dev/null +++ b/src/Routes/Images.h @@ -0,0 +1,8 @@ +#ifndef IMAGES_HANDLER_H +#define IMAGES_HANDLER_H + +#include <beaker.h> + +int images_handler(UrlParams *params); + +#endif diff --git a/src/Routes/Search.c b/src/Routes/Search.c new file mode 100644 index 0000000..4e8c7ad --- /dev/null +++ b/src/Routes/Search.c @@ -0,0 +1,275 @@ +#include "Search.h" +#include "../Infobox/Wikipedia.h" +#include "../Infobox/Calculator.h" +#include "../Infobox/Dictionary.h" +#include "../Scraping/Scraping.h" +#include "../Utility/Display.h" +#include "../Utility/Unescape.h" +#include <ctype.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +typedef struct { + const char *query; + InfoBox result; + int success; +} InfoBoxThreadData; + +static void *wiki_thread_func(void *arg) { + InfoBoxThreadData *data = (InfoBoxThreadData *)arg; + char *dynamic_url = construct_wiki_url(data->query); + if (dynamic_url) { + data->result = fetch_wiki_data(dynamic_url); + data->success = + (data->result.title != NULL && data->result.extract != NULL && + strlen(data->result.extract) > 10); + free(dynamic_url); + } else { + data->success = 0; + } + return NULL; +} + +static int is_calculator_query(const char *query) { + if (!query) return 0; + + int has_digit = 0; + int has_operator = 0; + + for (const char *p = query; *p; p++) { + if (isdigit(*p) || *p == '.') { + has_digit = 1; + } + if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '=' || + *p == '^') { + has_operator = 1; + } + } + + return has_digit && (has_operator || strchr(query, '.')); +} + +static void *calc_thread_func(void *arg) { + InfoBoxThreadData *data = (InfoBoxThreadData *)arg; + + if (is_calculator_query(data->query)) { + data->result = fetch_calc_data((char *)data->query); + data->success = + (data->result.title != NULL && 
data->result.extract != NULL); + } else { + data->success = 0; + } + + return NULL; +} + +static void *dict_thread_func(void *arg) { + InfoBoxThreadData *data = (InfoBoxThreadData *)arg; + + if (is_dictionary_query(data->query)) { + data->result = fetch_dictionary_data(data->query); + data->success = + (data->result.title != NULL && data->result.extract != NULL); + } else { + data->success = 0; + } + + return NULL; +} + +static int add_infobox_to_collection(InfoBox *infobox, char ****collection, + int **inner_counts, int current_count) { + *collection = + (char ***)realloc(*collection, sizeof(char **) * (current_count + 1)); + *inner_counts = + (int *)realloc(*inner_counts, sizeof(int) * (current_count + 1)); + + (*collection)[current_count] = (char **)malloc(sizeof(char *) * 4); + (*collection)[current_count][0] = infobox->title; + (*collection)[current_count][1] = infobox->thumbnail_url; + (*collection)[current_count][2] = infobox->extract; + (*collection)[current_count][3] = infobox->url; + (*inner_counts)[current_count] = 4; + + return current_count + 1; +} + +int results_handler(UrlParams *params) { + TemplateContext ctx = new_context(); + char *raw_query = ""; + int page = 1; + + if (params) { + for (int i = 0; i < params->count; i++) { + if (strcmp(params->params[i].key, "q") == 0) { + raw_query = params->params[i].value; + } else if (strcmp(params->params[i].key, "p") == 0) { + int parsed = atoi(params->params[i].value); + if (parsed > 1) page = parsed; + } + } + } + + context_set(&ctx, "query", raw_query); + + char page_str[16], prev_str[16], next_str[16]; + snprintf(page_str, sizeof(page_str), "%d", page); + snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? 
page - 1 : 0); + snprintf(next_str, sizeof(next_str), "%d", page + 1); + context_set(&ctx, "page", page_str); + context_set(&ctx, "prev_page", prev_str); + context_set(&ctx, "next_page", next_str); + + if (!raw_query || strlen(raw_query) == 0) { + send_response("<h1>No query provided</h1>"); + free_context(&ctx); + return -1; + } + + pthread_t wiki_tid, calc_tid, dict_tid; + InfoBoxThreadData wiki_data = {.query = raw_query, .success = 0}; + InfoBoxThreadData calc_data = {.query = raw_query, .success = 0}; + InfoBoxThreadData dict_data = {.query = raw_query, .success = 0}; + + if (page == 1) { + pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data); + pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data); + pthread_create(&dict_tid, NULL, dict_thread_func, &dict_data); + } + + ScrapeJob jobs[ENGINE_COUNT]; + SearchResult *all_results[ENGINE_COUNT]; + + for (int i = 0; i < ENGINE_COUNT; i++) { + all_results[i] = NULL; + jobs[i].engine = &ENGINE_REGISTRY[i]; + jobs[i].query = raw_query; + jobs[i].out_results = &all_results[i]; + jobs[i].max_results = 10; + jobs[i].results_count = 0; + jobs[i].page = page; + } + + scrape_engines_parallel(jobs, ENGINE_COUNT); + + if (page == 1) { + pthread_join(wiki_tid, NULL); + pthread_join(calc_tid, NULL); + pthread_join(dict_tid, NULL); + } + + char ***infobox_matrix = NULL; + int *infobox_inner_counts = NULL; + int infobox_count = 0; + + if (page == 1) { + if (dict_data.success) { + infobox_count = add_infobox_to_collection(&dict_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } + + if (calc_data.success) { + infobox_count = add_infobox_to_collection(&calc_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } + + if (wiki_data.success) { + infobox_count = add_infobox_to_collection(&wiki_data.result, &infobox_matrix, + &infobox_inner_counts, infobox_count); + } + } + + if (infobox_count > 0) { + context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix, + 
infobox_count, infobox_inner_counts); + free(infobox_matrix); + free(infobox_inner_counts); + } + + int total_results = 0; + for (int i = 0; i < ENGINE_COUNT; i++) { + total_results += jobs[i].results_count; + } + + if (total_results > 0) { + char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results); + int *results_inner_counts = (int *)malloc(sizeof(int) * total_results); + char **seen_urls = (char **)malloc(sizeof(char *) * total_results); + int unique_count = 0; + + for (int i = 0; i < ENGINE_COUNT; i++) { + for (int j = 0; j < jobs[i].results_count; j++) { + char *display_url = all_results[i][j].url; + + int is_duplicate = 0; + for (int k = 0; k < unique_count; k++) { + if (strcmp(seen_urls[k], display_url) == 0) { + is_duplicate = 1; + break; + } + } + + if (is_duplicate) { + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); + continue; + } + + seen_urls[unique_count] = strdup(display_url); + results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4); + char *pretty_url = pretty_display_url(display_url); + + results_matrix[unique_count][0] = strdup(display_url); + results_matrix[unique_count][1] = strdup(pretty_url); + results_matrix[unique_count][2] = all_results[i][j].title ? strdup(all_results[i][j].title) : strdup("Untitled"); + results_matrix[unique_count][3] = all_results[i][j].snippet ? 
strdup(all_results[i][j].snippet) : strdup(""); + + results_inner_counts[unique_count] = 4; + + free(pretty_url); + free(all_results[i][j].url); + free(all_results[i][j].title); + free(all_results[i][j].snippet); + + unique_count++; + } + free(all_results[i]); + } + + context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count, results_inner_counts); + + char *html = render_template("results.html", &ctx); + if (html) { + send_response(html); + free(html); + } + + for (int i = 0; i < unique_count; i++) { + for (int j = 0; j < 4; j++) free(results_matrix[i][j]); + free(results_matrix[i]); + free(seen_urls[i]); + } + free(seen_urls); + free(results_matrix); + free(results_inner_counts); + } else { + char *html = render_template("results.html", &ctx); + if (html) { + send_response(html); + free(html); + } + } + + if (page == 1) { + if (wiki_data.success) free_infobox(&wiki_data.result); + if (calc_data.success) free_infobox(&calc_data.result); + if (dict_data.success) free_infobox(&dict_data.result); + } + free_context(&ctx); + + return 0; +}
\ No newline at end of file diff --git a/src/Routes/Search.h b/src/Routes/Search.h new file mode 100644 index 0000000..c6bc146 --- /dev/null +++ b/src/Routes/Search.h @@ -0,0 +1,8 @@ +#ifndef SEARCH_HANDLER_H +#define SEARCH_HANDLER_H + +#include <beaker.h> + +int results_handler(UrlParams *params); + +#endif diff --git a/src/Scraping/Scraping.c b/src/Scraping/Scraping.c new file mode 100644 index 0000000..42e05d6 --- /dev/null +++ b/src/Scraping/Scraping.c @@ -0,0 +1,459 @@ +#include "Scraping.h" +#include "../Utility/Unescape.h" +#include <curl/curl.h> +#include <libxml/HTMLparser.h> +#include <libxml/xpath.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + MemoryBuffer *mem = (MemoryBuffer *)userp; + + if (mem->size + realsize + 1 > mem->capacity) { + + size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2; + while (new_cap < mem->size + realsize + 1) new_cap *= 2; + + char *ptr = (char *)realloc(mem->memory, new_cap); + if (!ptr) { + return 0; + } + mem->memory = ptr; + mem->capacity = new_cap; + } + + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + + return realsize; +} + +static const char *get_random_user_agent() { + static const char *agents[] = { + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, " + "like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like " + "Gecko) " + "Chrome/120.0.0.0` Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 " + "Firefox/121.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 " + "(KHTML, like Gecko) 
Version/17.2 Safari/605.1.15"}; + return agents[rand() % 5]; +} + +static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + (void)engine_name; + int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + return 0; + } + + const char *link_xpath = "//tr[not(contains(@class, 'result-sponsored'))]//a[@class='result-link']"; + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + int num_links = xpathObj->nodesetval->nodeNr; + + int actual_alloc = (num_links < max_results) ? num_links : max_results; + *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + if (!*out_results) { + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + for (int i = 0; i < num_links && found_count < max_results; i++) { + xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i]; + char *title = (char *)xmlNodeGetContent(linkNode); + char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href"); + char *snippet_text = NULL; + + xmlNodePtr current = linkNode->parent; + while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0) + current = current->parent; + + if (current && current->next) { + xmlNodePtr snippetRow = current->next; + while (snippetRow && + xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0) + snippetRow = snippetRow->next; + if (snippetRow) { + + xpathCtx->node = snippetRow; + xmlXPathObjectPtr sObj = xmlXPathEvalExpression( + (xmlChar *)".//td[@class='result-snippet']", xpathCtx); + if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) { + snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]); + } + if (sObj) xmlXPathFreeObject(sObj); + xpathCtx->node = NULL; + + } + } + 
+ (*out_results)[found_count].url = unescape_search_url(url); + (*out_results)[found_count].title = strdup(title ? title : "No Title"); + (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : ""); + + found_count++; + + if (title) xmlFree(title); + if (url) xmlFree(url); + if (snippet_text) xmlFree(snippet_text); + } + + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return found_count; +} + +static int parse_startpage(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results) { + (void)engine_name; + int found_count = 0; + xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); + if (!xpathCtx) { + return 0; + } + + const char *container_xpath = "//div[contains(@class, 'result')]"; + xmlXPathObjectPtr xpathObj = + xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx); + + if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) { + if (xpathObj) xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + int num_results = xpathObj->nodesetval->nodeNr; + + int actual_alloc = (num_results < max_results) ? num_results : max_results; + *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult)); + if (!*out_results) { + xmlXPathFreeObject(xpathObj); + xmlXPathFreeContext(xpathCtx); + return 0; + } + + for (int i = 0; i < num_results && found_count < max_results; i++) { + xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i]; + xpathCtx->node = resultNode; + + xmlXPathObjectPtr linkObj = xmlXPathEvalExpression( + (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx); + char *url = + (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0) + ? 
/*
 * Parse Startpage results: each hit is a div with class "result"
 * containing a result-link anchor, a wgl-title heading, and a
 * description paragraph. Results missing url or title are skipped.
 * Caller owns *out_results and every string inside it.
 */
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
                           SearchResult **out_results, int max_results) {
    (void)engine_name;
    int found_count = 0;
    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    if (!xpathCtx) {
        return 0;
    }

    const char *container_xpath = "//div[contains(@class, 'result')]";
    xmlXPathObjectPtr xpathObj =
        xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);

    if (!xpathObj || !xpathObj->nodesetval ||
        xpathObj->nodesetval->nodeNr == 0) {
        if (xpathObj) xmlXPathFreeObject(xpathObj);
        xmlXPathFreeContext(xpathCtx);
        return 0;
    }

    int num_results = xpathObj->nodesetval->nodeNr;

    /* Allocate only what can actually be returned. */
    int actual_alloc = (num_results < max_results) ? num_results : max_results;
    *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
    if (!*out_results) {
        xmlXPathFreeObject(xpathObj);
        xmlXPathFreeContext(xpathCtx);
        return 0;
    }

    for (int i = 0; i < num_results && found_count < max_results; i++) {
        xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
        /* Anchor the relative ".//" queries below at this result div. */
        xpathCtx->node = resultNode;

        xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
            (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
        char *url =
            (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
                ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
                                     (xmlChar *)"href")
                : NULL;

        xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
            (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
        char *title =
            (titleObj && titleObj->nodesetval &&
             titleObj->nodesetval->nodeNr > 0)
                ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
                : NULL;

        xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
            (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
        char *snippet_text =
            (snippetObj && snippetObj->nodesetval &&
             snippetObj->nodesetval->nodeNr > 0)
                ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
                : NULL;

        /* Both url and title are required; snippet is optional. */
        if (url && title) {
            (*out_results)[found_count].url = strdup(url);
            (*out_results)[found_count].title = strdup(title);
            (*out_results)[found_count].snippet =
                strdup(snippet_text ? snippet_text : "");
            found_count++;
        }

        if (title) xmlFree(title);
        if (url) xmlFree(url);
        if (snippet_text) xmlFree(snippet_text);
        if (linkObj) xmlXPathFreeObject(linkObj);
        if (titleObj) xmlXPathFreeObject(titleObj);
        if (snippetObj) xmlXPathFreeObject(snippetObj);
    }

    xpathCtx->node = NULL;

    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return found_count;
}

/*
 * Parse Yahoo results: each hit is a div with class "algo-sr". Yahoo
 * hrefs are redirect URLs, so the real target is recovered with
 * unescape_search_url (RU= parameter). Caller owns *out_results and
 * every string inside it.
 */
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
                       SearchResult **out_results, int max_results) {
    (void)engine_name;
    int found_count = 0;
    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    if (!xpathCtx) {
        return 0;
    }

    const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
    xmlXPathObjectPtr xpathObj =
        xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);

    if (!xpathObj || !xpathObj->nodesetval ||
        xpathObj->nodesetval->nodeNr == 0) {
        if (xpathObj) xmlXPathFreeObject(xpathObj);
        xmlXPathFreeContext(xpathCtx);
        return 0;
    }

    int num_results = xpathObj->nodesetval->nodeNr;

    int actual_alloc = (num_results < max_results) ? num_results : max_results;
    *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
    if (!*out_results) {
        xmlXPathFreeObject(xpathObj);
        xmlXPathFreeContext(xpathCtx);
        return 0;
    }

    for (int i = 0; i < num_results && found_count < max_results; i++) {
        xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
        xpathCtx->node = resultNode;

        xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
            (xmlChar *)".//div[contains(@class, "
                       "'compTitle')]//a[@target='_blank']",
            xpathCtx);
        char *url =
            (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
                ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
                                     (xmlChar *)"href")
                : NULL;

        xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
            (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
        char *title =
            (titleObj && titleObj->nodesetval &&
             titleObj->nodesetval->nodeNr > 0)
                ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
                : NULL;

        xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
            (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
        char *snippet_text =
            (snippetObj && snippetObj->nodesetval &&
             snippetObj->nodesetval->nodeNr > 0)
                ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
                : NULL;

        if (url && title) {
            /* Yahoo wraps targets in a redirect; decode the RU= payload. */
            (*out_results)[found_count].url = unescape_search_url(url);
            (*out_results)[found_count].title = strdup(title);
            (*out_results)[found_count].snippet =
                strdup(snippet_text ? snippet_text : "");
            found_count++;
        }

        if (title) xmlFree(title);
        if (url) xmlFree(url);
        if (snippet_text) xmlFree(snippet_text);
        if (linkObj) xmlXPathFreeObject(linkObj);
        if (titleObj) xmlXPathFreeObject(titleObj);
        if (snippetObj) xmlXPathFreeObject(snippetObj);
    }

    xpathCtx->node = NULL;
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return found_count;
}

/*
 * Registry of supported engines. `page_param` / `page_multiplier` /
 * `page_base` encode each engine's pagination scheme: the request adds
 * &<page_param>=<(page-1)*multiplier + base>.
 */
const SearchEngine ENGINE_REGISTRY[] = {
    {.name = "DuckDuckGo Lite",
     .base_url = "https://lite.duckduckgo.com/lite/?q=",
     .host_header = "lite.duckduckgo.com",
     .referer = "https://lite.duckduckgo.com/",
     .page_param = "s",
     .page_multiplier = 30,
     .page_base = 0,
     .parser = parse_ddg_lite},
    {.name = "Startpage",
     .base_url = "https://www.startpage.com/sp/search?q=",
     .host_header = "www.startpage.com",
     .referer = "https://www.startpage.com/",
     .page_param = "page",
     .page_multiplier = 1,
     .page_base = 1,
     .parser = parse_startpage},
    {.name = "Yahoo",
     .base_url = "https://search.yahoo.com/search?p=",
     .host_header = "search.yahoo.com",
     .referer = "https://search.yahoo.com/",
     .page_param = "b",
     .page_multiplier = 10,
     .page_base = 1,
     .parser = parse_yahoo}};

const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);

/*
 * Apply the shared transfer options to one easy handle: write callback
 * into `chunk`, randomized User-Agent, HTTP/2, automatic decompression
 * (empty Accept-Encoding string), DNS caching, redirects, a 15 s
 * timeout, TLS verification, and an in-memory cookie engine.
 */
static void configure_curl_handle(CURL *curl, const char *full_url,
                                  MemoryBuffer *chunk,
                                  struct curl_slist *headers) {
    curl_easy_setopt(curl, CURLOPT_URL, full_url);
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());

    curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

    /* "" enables all encodings libcurl supports and auto-decompresses. */
    curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");

    curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);

    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
    /* "" starts the cookie engine without reading a file. */
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
}
CURLOPT_TIMEOUT, 15L); + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L); + curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); +} + +int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) { + CURLM *multi_handle = curl_multi_init(); + if (!multi_handle) { + return -1; + } + + for (int i = 0; i < num_jobs; i++) { + ScrapeJob *job = &jobs[i]; + job->handle = curl_easy_init(); + if (!job->handle) { + continue; + } + + job->response.memory = (char *)malloc(16384); + job->response.size = 0; + job->response.capacity = 16384; + + char full_url[1024]; + char *encoded_query = curl_easy_escape(job->handle, job->query, 0); + if (!encoded_query) { + curl_easy_cleanup(job->handle); + job->handle = NULL; + continue; + } + + int page = (job->page < 1) ? 1 : job->page; + int page_value = (page - 1) * job->engine->page_multiplier + job->engine->page_base; + + snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", + job->engine->base_url, + encoded_query, + job->engine->page_param, + page_value); + curl_free(encoded_query); + + struct curl_slist *headers = NULL; + char host_buf[256], ref_buf[256]; + snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header); + snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer); + headers = curl_slist_append(headers, host_buf); + headers = curl_slist_append(headers, ref_buf); + headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); + headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5"); + headers = curl_slist_append(headers, "DNT: 1"); + + configure_curl_handle(job->handle, full_url, &job->response, headers); + + curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers); + + curl_multi_add_handle(multi_handle, job->handle); + } + + usleep(100000 + (rand() % 100000)); + + int still_running = 0; + curl_multi_perform(multi_handle, &still_running); + + do { + int numfds = 0; + CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, 
&numfds); + + if (mc != CURLM_OK) { + break; + } + + curl_multi_perform(multi_handle, &still_running); + } while (still_running); + + CURLMsg *msg; + int msgs_left; + while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) { + if (msg->msg == CURLMSG_DONE) { + CURL *handle = msg->easy_handle; + + for (int i = 0; i < num_jobs; i++) { + if (jobs[i].handle == handle) { + ScrapeJob *job = &jobs[i]; + + long response_code; + curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code); + + if (msg->data.result == CURLE_OK && job->response.size > 0) { + xmlDocPtr doc = htmlReadMemory( + job->response.memory, job->response.size, NULL, NULL, + HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); + + if (doc) { + job->results_count = job->engine->parser( + job->engine->name, doc, job->out_results, job->max_results); + xmlFreeDoc(doc); + } + } else { + job->results_count = 0; + } + + struct curl_slist *headers; + curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers); + if (headers) curl_slist_free_all(headers); + + free(job->response.memory); + curl_multi_remove_handle(multi_handle, handle); + curl_easy_cleanup(handle); + break; + } + } + } + } + + curl_multi_cleanup(multi_handle); + return 0; +} + +int scrape_engine(const SearchEngine *engine, const char *query, + SearchResult **out_results, int max_results) { + ScrapeJob job = { + .engine = engine, + .query = (char *)query, + .out_results = out_results, + .max_results = max_results, + .results_count = 0, + .page = 1 + }; + + scrape_engines_parallel(&job, 1); + return job.results_count; +}
\ No newline at end of file diff --git a/src/Scraping/Scraping.h b/src/Scraping/Scraping.h new file mode 100644 index 0000000..d8a3b13 --- /dev/null +++ b/src/Scraping/Scraping.h @@ -0,0 +1,58 @@ +#ifndef SCRAPING_H +#define SCRAPING_H + +#include <libxml/HTMLparser.h> +#include <curl/curl.h> + +#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__) +#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__) +#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__) +#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__) + +typedef struct { + char *url; + char *title; + char *snippet; +} SearchResult; + +typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc, + SearchResult **out_results, int max_results); + +typedef struct { + const char *name; + const char *base_url; + const char *host_header; + const char *referer; + + const char *page_param; + int page_multiplier; + int page_base; + ParserFunc parser; +} SearchEngine; + +typedef struct { + char *memory; + size_t size; + size_t capacity; +} MemoryBuffer; + +typedef struct { + const SearchEngine *engine; + char *query; + SearchResult **out_results; + int max_results; + int page; + CURL *handle; + MemoryBuffer response; + int results_count; +} ScrapeJob; + +extern const SearchEngine ENGINE_REGISTRY[]; +extern const int ENGINE_COUNT; + +int scrape_engine(const SearchEngine *engine, const char *query, + SearchResult **out_results, int max_results); + +int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs); + +#endif
\ No newline at end of file diff --git a/src/Utility/Display.c b/src/Utility/Display.c new file mode 100644 index 0000000..492e998 --- /dev/null +++ b/src/Utility/Display.c @@ -0,0 +1,46 @@ +#include "Display.h" +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> + +char *pretty_display_url(const char *input) { + if (!input) return NULL; + + const char *start = input; + + const char *protocol_pos = strstr(input, "://"); + if (protocol_pos) { + start = protocol_pos + 3; + } + + if (strncasecmp(start, "www.", 4) == 0) { + start += 4; + } + + size_t input_len = strlen(start); + char temp[512]; + strncpy(temp, start, sizeof(temp) - 1); + temp[sizeof(temp) - 1] = '\0'; + + if (input_len > 0 && temp[input_len - 1] == '/') { + temp[input_len - 1] = '\0'; + } + + char *output = (char *)malloc(strlen(temp) * 3 + 1); + if (!output) return NULL; + + size_t j = 0; + for (size_t i = 0; temp[i] != '\0'; i++) { + if (temp[i] == '/') { + output[j++] = ' '; + output[j++] = '>'; + output[j++] = ' '; + } else { + output[j++] = (char)tolower((unsigned char)temp[i]); + } + } + output[j] = '\0'; + + return output; +} diff --git a/src/Utility/Display.h b/src/Utility/Display.h new file mode 100644 index 0000000..bbaf421 --- /dev/null +++ b/src/Utility/Display.h @@ -0,0 +1,6 @@ +#ifndef DISPLAY_H +#define DISPLAY_H + +char *pretty_display_url(const char *input); + +#endif diff --git a/src/Utility/Unescape.c b/src/Utility/Unescape.c new file mode 100644 index 0000000..e2968b2 --- /dev/null +++ b/src/Utility/Unescape.c @@ -0,0 +1,80 @@ +#include "Unescape.h" +#include "Utility.h" +#include <stdlib.h> +#include <string.h> + +char *unescape_search_url(const char *input) { + if (!input) return NULL; + + const char *key = NULL; + const char *start = NULL; + const char *end = NULL; + size_t len = 0; + + if (strstr(input, "uddg=")) { + key = "uddg="; + start = strstr(input, key); + if (!start) return NULL; + start += strlen(key); + end = strchr(start, '&'); + len = 
end ? (size_t)(end - start) : strlen(start); + } + + else if (strstr(input, "RU=")) { + key = "RU="; + start = strstr(input, key); + if (!start) return strdup(input); + start += strlen(key); + end = strchr(start, '/'); + len = end ? (size_t)(end - start) : strlen(start); + } + + else { + return strdup(input); + } + + char *output = (char *)malloc(len * 3 + 1); + if (!output) return NULL; + + size_t i = 0, j = 0; + while (i < len) { + if (start[i] == '%' && i + 2 < len) { + int high = hex_to_int(start[i + 1]); + int low = hex_to_int(start[i + 2]); + if (high != -1 && low != -1) { + output[j++] = (char)((high << 4) | low); + i += 3; + } else { + output[j++] = start[i++]; + } + } else if (start[i] == '+') { + output[j++] = ' '; + i++; + } else { + output[j++] = start[i++]; + } + } + output[j] = '\0'; + + return output; +} + +char *url_decode_query(const char *src) { + if (!src) return NULL; + char *res = strdup(src); + char *p = res; + while (*src) { + if (*src == '+') { + *p++ = ' '; + } else if (*src == '%' && src[1] && src[2]) { + char hex[3] = {src[1], src[2], '\0'}; + *p++ = (char)strtol(hex, NULL, 16); + src += 2; + } else { + *p++ = *src; + } + src++; + } + *p = '\0'; + return res; +} diff --git a/src/Utility/Unescape.h b/src/Utility/Unescape.h new file mode 100644 index 0000000..0adb228 --- /dev/null +++ b/src/Utility/Unescape.h @@ -0,0 +1,10 @@ +#ifndef UNESCAPE_H +#define UNESCAPE_H + +#include <stddef.h> + +char *unescape_search_url(const char *input); +char *url_decode_query(const char *src); + +#endif + diff --git a/src/Utility/Utility.c b/src/Utility/Utility.c new file mode 100644 index 0000000..8e5af92 --- /dev/null +++ b/src/Utility/Utility.c @@ -0,0 +1,8 @@ +#include "Utility.h" + +int hex_to_int(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return -1; +} diff --git a/src/Utility/Utility.h b/src/Utility/Utility.h new file mode 100644 index 
0000000..3b0181c --- /dev/null +++ b/src/Utility/Utility.h @@ -0,0 +1,6 @@ +#ifndef UTILITY_H +#define UTILITY_H + +int hex_to_int(char c); + +#endif |
