// Reads stored HTML pages from the public.alumno_extraccion table (PostgreSQL),
// extracts student fields from each page with libxml2, and prints them.
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <libpq-fe.h>
- #include <libxml/HTMLparser.h>
- #include <libxml/xpath.h>
// Student ("alumno") record populated from one scraped HTML page.
//
// All text fields are meant to hold NUL-terminated strings, so each array
// needs room for its longest value plus one byte for the terminator.
// clave_carrera, plan and clave hold up to 2, 4 and 7 characters of text
// respectively; they are sized 3, 5 and 8 so the terminator always fits.
struct alumno {
    char apellido_paterno[100]; // paternal surname
    char apellido_materno[100]; // maternal surname
    char curp[20];              // CURP string
    char clave_carrera[3];      // career key: 2 characters + NUL
    char plan[5];               // plan: 4 characters + NUL
    char clave[8];              // student key: 7 characters + NUL
    char nombre[100];           // given name(s)
    char correo[100];           // e-mail address
    char estatus;               // first character of the status label
    char telefono[11];          // phone number: up to 10 characters + NUL
    int semestre;               // semester number
    char sexo;                  // first character of the sex label
};
- // Function to check for PostgreSQL connection errors
- void check_conn_status(PGconn *conn) {
- if (PQstatus(conn) != CONNECTION_OK) {
- fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
- PQfinish(conn);
- exit(EXIT_FAILURE);
- }
- }
- // Function to check for PostgreSQL query execution errors
- void check_exec_status(PGresult *res, PGconn *conn) {
- if (PQresultStatus(res) != PGRES_TUPLES_OK) {
- fprintf(stderr, "Query failed: %s", PQerrorMessage(conn));
- PQclear(res);
- PQfinish(conn);
- exit(EXIT_FAILURE);
- }
- }
- // Function to extract content from an HTML element by ID
- char* get_element_content_by_id(htmlDocPtr doc, const char *id) {
- xmlChar xpath[100];
- snprintf((char *)xpath, sizeof(xpath), "//*[@id='%s']", id);
-
- xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
- xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(xpath, xpathCtx);
-
- if (xpathObj == NULL || xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return NULL;
- }
- xmlNodePtr node = xpathObj->nodesetval->nodeTab[0];
- xmlChar *content = xmlNodeGetContent(node);
- xmlXPathFreeObject(xpathObj);
- xmlXPathFreeContext(xpathCtx);
- return (char *)content;
- }
- // Function to parse HTML content using libxml2 and populate the alumno struct
- void parse_html(const char *html, struct alumno *alum) {
- htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
- if (doc == NULL) {
- fprintf(stderr, "Failed to parse HTML\n");
- return;
- }
- char *content;
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApPatAlumnoHP");
- if (content) {
- strncpy(alum->apellido_paterno, content, 100);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApMatAlumnoHP");
- if (content) {
- strncpy(alum->apellido_materno, content, 100);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCURPAlumnoHP");
- if (content) {
- strncpy(alum->curp, content, 20);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveCarrera");
- if (content) {
- strncpy(alum->clave_carrera, content, 2);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblAlupla");
- if (content) {
- strncpy(alum->plan, content, 4);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveUlsa");
- if (content) {
- strncpy(alum->clave, content, 7);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblNombreAlumnoHP");
- if (content) {
- strncpy(alum->nombre, content, 100);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCorreoAlumnoHP");
- if (content) {
- strncpy(alum->correo, content, 100);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblStat");
- if (content) {
- alum->estatus = content[0];
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblTelefonoAlumnoHP");
- if (content) {
- strncpy(alum->telefono, content, 11);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblSem");
- if (content) {
- alum->semestre = atoi(content);
- xmlFree(content);
- }
- content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblSexoAlumnoHP");
- if (content) {
- alum->sexo = content[0];
- xmlFree(content);
- }
- xmlFreeDoc(doc);
- }
- int main() {
- // PostgreSQL connection parameters
- const char *conninfo = "dbname=sgi user=postgres password=h3rcul3s#$ hostaddr=200.13.89.8 port=5432";
- PGconn *conn = PQconnectdb(conninfo);
- // Check connection status
- check_conn_status(conn);
- // Execute SQL query to retrieve HTML content
- PGresult *res = PQexec(conn, "SELECT datos_html FROM public.alumno_extraccion WHERE error_message IS NULL");
- check_exec_status(res, conn);
- // Process each row
- int rows = PQntuples(res);
- for (int i = 0; i < rows; i++) {
- char *html_content = PQgetvalue(res, i, 0);
- // printf("HTML Content: %s\n", html_content);
- struct alumno alum;
- memset(&alum, 0, sizeof(alum)); // Initialize the struct to zero
- parse_html(html_content, &alum);
- printf("Apellido Paterno: %s\n", alum.apellido_paterno);
- printf("Apellido Materno: %s\n", alum.apellido_materno);
- printf("CURP: %s\n", alum.curp);
- printf("Clave Carrera: %s\n", alum.clave_carrera);
- printf("Plan: %s\n", alum.plan);
- printf("Clave: %s\n", alum.clave);
- printf("Nombre: %s\n", alum.nombre);
- printf("Correo: %s\n", alum.correo);
- printf("Estatus: %c\n", alum.estatus);
- printf("Telefono: %s\n", alum.telefono);
- printf("Semestre: %d\n", alum.semestre);
- printf("Sexo: %c\n", alum.sexo);
- }
- // Clean up
- PQclear(res);
- PQfinish(conn);
- return 0;
- }
|