diff options
Diffstat (limited to 'src/Infobox/Wikipedia.c')
| -rw-r--r-- | src/Infobox/Wikipedia.c | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/src/Infobox/Wikipedia.c b/src/Infobox/Wikipedia.c new file mode 100644 index 0000000..ed4645f --- /dev/null +++ b/src/Infobox/Wikipedia.c @@ -0,0 +1,165 @@ +#include "Wikipedia.h" +#include <curl/curl.h> +#include <libxml/parser.h> +#include <libxml/tree.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +struct WikiMemoryStruct { + char *memory; + size_t size; +}; + +static void shorten_summary(char **extract_ptr, int max_chars) { + if (!extract_ptr || !*extract_ptr) return; + + char *text = *extract_ptr; + int len = strlen(text); + + if (len <= max_chars) return; + + int end_pos = max_chars; + for (int i = max_chars; i > (max_chars / 2); i--) { + if (text[i] == '.' || text[i] == '!' || text[i] == '?') { + end_pos = i + 1; + break; + } + } + + char *new_text = (char *)malloc(end_pos + 4); + + if (new_text) { + strncpy(new_text, text, end_pos); + new_text[end_pos] = '\0'; + strcat(new_text, "..."); + free(*extract_ptr); + *extract_ptr = new_text; + } +} + +static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb, + void *userp) { + size_t realsize = size * nmemb; + struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp; + + char *ptr = realloc(mem->memory, mem->size + realsize + 1); + if (ptr == NULL) { + fprintf(stderr, "Not enough memory (realloc returned NULL)\n"); + return 0; + } + + mem->memory = ptr; + memcpy(&(mem->memory[mem->size]), contents, realsize); + mem->size += realsize; + mem->memory[mem->size] = 0; + + return realsize; +} + +static void extract_wiki_info(xmlNode *node, InfoBox *info) { + xmlNode *cur_node = NULL; + + for (cur_node = node; cur_node; cur_node = cur_node->next) { + if (cur_node->type == XML_ELEMENT_NODE) { + if (strcmp((const char *)cur_node->name, "page") == 0) { + xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title"); + if (title) { + info->title = strdup((const char *)title); + + const char *base_article_url = "https://en.wikipedia.org/wiki/"; + char *formatted_title = strdup((const char *)title); + for (int i = 0; formatted_title[i]; i++) { + if (formatted_title[i] == ' ') formatted_title[i] = '_'; + } + + info->url = + malloc(strlen(base_article_url) + strlen(formatted_title) + 1); + if (info->url) { + strcpy(info->url, base_article_url); + strcat(info->url, formatted_title); + } + free(formatted_title); + xmlFree(title); + } + } + + if (strcmp((const char *)cur_node->name, "thumbnail") == 0) { + xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source"); + if (source) { + info->thumbnail_url = strdup((const char *)source); + xmlFree(source); + } + } + + if (strcmp((const char *)cur_node->name, "extract") == 0) { + xmlChar *content = xmlNodeGetContent(cur_node); + if (content) { + info->extract = strdup((const char *)content); + + shorten_summary(&(info->extract), 300); + xmlFree(content); + } + } + } + extract_wiki_info(cur_node->children, info); + } +} + +InfoBox fetch_wiki_data(char *api_url) { + CURL *curl_handle; + CURLcode res; + struct WikiMemoryStruct chunk; + InfoBox info = {NULL, NULL, NULL, NULL}; + + chunk.memory = malloc(1); + chunk.size = 0; + + curl_handle = curl_easy_init(); + + if (curl_handle) { + curl_easy_setopt(curl_handle, CURLOPT_URL, api_url); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, + WikiWriteMemoryCallback); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk); + curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0"); + + res = curl_easy_perform(curl_handle); + + if (res == CURLE_OK) { + xmlDocPtr doc = + xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0); + if (doc != NULL) { + xmlNode *root_element = xmlDocGetRootElement(doc); + extract_wiki_info(root_element, &info); + xmlFreeDoc(doc); + } + } + + curl_easy_cleanup(curl_handle); + free(chunk.memory); + } + + return info; +} + +char *construct_wiki_url(const char *search_term) { + CURL *curl = curl_easy_init(); + if (!curl) return NULL; + + char *escaped_term = curl_easy_escape(curl, search_term, 0); + const char *base = + "https://en.wikipedia.org/w/" + "api.php?action=query&prop=extracts|pageimages&exintro&" + "explaintext&pithumbsize=400&format=xml&origin=*&titles="; + + char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1); + if (full_url) { + strcpy(full_url, base); + strcat(full_url, escaped_term); + } + + curl_free(escaped_term); + curl_easy_cleanup(curl); + return full_url; +} |
