#include #include #include #include #include #include // Define the alumno struct struct alumno { char apellido_paterno[100]; char apellido_materno[100]; char curp[20]; char clave_carrera[2]; char plan[2]; char clave[6]; char nombre[100]; char correo[100]; char estatus; char telefono[11]; int semestre; char sexo; }; // Function to check for PostgreSQL connection errors void check_conn_status(PGconn *conn) { if (PQstatus(conn) != CONNECTION_OK) { fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn)); PQfinish(conn); exit(EXIT_FAILURE); } } // Function to check for PostgreSQL query execution errors void check_exec_status(PGresult *res, PGconn *conn) { if (PQresultStatus(res) != PGRES_TUPLES_OK) { fprintf(stderr, "Query failed: %s", PQerrorMessage(conn)); PQclear(res); PQfinish(conn); exit(EXIT_FAILURE); } } // Function to extract content from an HTML element by ID char* get_element_content_by_id(htmlDocPtr doc, const char *id) { xmlChar xpath[100]; snprintf((char *)xpath, sizeof(xpath), "//*[@id='%s']", id); xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc); xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(xpath, xpathCtx); if (xpathObj == NULL || xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) { xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); return NULL; } xmlNodePtr node = xpathObj->nodesetval->nodeTab[0]; xmlChar *content = xmlNodeGetContent(node); xmlXPathFreeObject(xpathObj); xmlXPathFreeContext(xpathCtx); return (char *)content; } // Function to parse HTML content using libxml2 and populate the alumno struct void parse_html(const char *html, struct alumno *alum) { htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if (doc == NULL) { fprintf(stderr, "Failed to parse HTML\n"); return; } char *content; content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApPatAlumnoHP"); if (content) { strncpy(alum->apellido_paterno, content, 100); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApMatAlumnoHP"); if (content) { strncpy(alum->apellido_materno, content, 100); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCURPAlumnoHP"); if (content) { strncpy(alum->curp, content, 20); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveCarrera"); if (content) { strncpy(alum->clave_carrera, content, 2); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblAlupla"); if (content) { strncpy(alum->plan, content, 4); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveUlsa"); if (content) { strncpy(alum->clave, content, 7); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblNombreAlumnoHP"); if (content) { strncpy(alum->nombre, content, 100); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCorreoAlumnoHP"); if (content) { strncpy(alum->correo, content, 100); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblStat"); if (content) { alum->estatus = content[0]; xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblTelefonoAlumnoHP"); if (content) { strncpy(alum->telefono, content, 11); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblSem"); if (content) { alum->semestre = atoi(content); xmlFree(content); } content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblSexoAlumnoHP"); if (content) { alum->sexo = content[0]; xmlFree(content); } xmlFreeDoc(doc); } int main() { // PostgreSQL connection parameters const char *conninfo = "dbname=sgi user=postgres password=h3rcul3s#$ hostaddr=200.13.89.8 port=5432"; PGconn *conn = PQconnectdb(conninfo); // Check connection status check_conn_status(conn); // Execute SQL query to retrieve HTML content PGresult *res = PQexec(conn, "SELECT datos_html FROM public.alumno_extraccion WHERE error_message IS NULL"); check_exec_status(res, conn); // Process each row int rows = PQntuples(res); for (int i = 0; i < rows; i++) { char *html_content = PQgetvalue(res, i, 0); // printf("HTML Content: %s\n", html_content); struct alumno alum; memset(&alum, 0, sizeof(alum)); // Initialize the struct to zero parse_html(html_content, &alum); printf("Apellido Paterno: %s\n", alum.apellido_paterno); printf("Apellido Materno: %s\n", alum.apellido_materno); printf("CURP: %s\n", alum.curp); printf("Clave Carrera: %s\n", alum.clave_carrera); printf("Plan: %s\n", alum.plan); printf("Clave: %s\n", alum.clave); printf("Nombre: %s\n", alum.nombre); printf("Correo: %s\n", alum.correo); printf("Estatus: %c\n", alum.estatus); printf("Telefono: %s\n", alum.telefono); printf("Semestre: %d\n", alum.semestre); printf("Sexo: %c\n", alum.sexo); } // Clean up PQclear(res); PQfinish(conn); return 0; }