extraccion_html.c 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libpq-fe.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
  7. // Define the alumno struct
  8. struct alumno {
  9. char apellido_paterno[100];
  10. char apellido_materno[100];
  11. char curp[20];
  12. char clave_carrera[2];
  13. char plan[2];
  14. char clave[6];
  15. char nombre[100];
  16. char correo[100];
  17. char estatus;
  18. char telefono[11];
  19. int semestre;
  20. char sexo;
  21. };
  22. // Function to check for PostgreSQL connection errors
  23. void check_conn_status(PGconn *conn) {
  24. if (PQstatus(conn) != CONNECTION_OK) {
  25. fprintf(stderr, "Connection to database failed: %s", PQerrorMessage(conn));
  26. PQfinish(conn);
  27. exit(EXIT_FAILURE);
  28. }
  29. }
  30. // Function to check for PostgreSQL query execution errors
  31. void check_exec_status(PGresult *res, PGconn *conn) {
  32. if (PQresultStatus(res) != PGRES_TUPLES_OK) {
  33. fprintf(stderr, "Query failed: %s", PQerrorMessage(conn));
  34. PQclear(res);
  35. PQfinish(conn);
  36. exit(EXIT_FAILURE);
  37. }
  38. }
  39. // Function to extract content from an HTML element by ID
  40. char* get_element_content_by_id(htmlDocPtr doc, const char *id) {
  41. xmlChar xpath[100];
  42. snprintf((char *)xpath, sizeof(xpath), "//*[@id='%s']", id);
  43. xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  44. xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression(xpath, xpathCtx);
  45. if (xpathObj == NULL || xmlXPathNodeSetIsEmpty(xpathObj->nodesetval)) {
  46. xmlXPathFreeObject(xpathObj);
  47. xmlXPathFreeContext(xpathCtx);
  48. return NULL;
  49. }
  50. xmlNodePtr node = xpathObj->nodesetval->nodeTab[0];
  51. xmlChar *content = xmlNodeGetContent(node);
  52. xmlXPathFreeObject(xpathObj);
  53. xmlXPathFreeContext(xpathCtx);
  54. return (char *)content;
  55. }
  56. // Function to parse HTML content using libxml2 and populate the alumno struct
  57. void parse_html(const char *html, struct alumno *alum) {
  58. htmlDocPtr doc = htmlReadMemory(html, strlen(html), NULL, NULL, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
  59. if (doc == NULL) {
  60. fprintf(stderr, "Failed to parse HTML\n");
  61. return;
  62. }
  63. char *content;
  64. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApPatAlumnoHP");
  65. if (content) {
  66. strncpy(alum->apellido_paterno, content, 100);
  67. xmlFree(content);
  68. }
  69. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblApMatAlumnoHP");
  70. if (content) {
  71. strncpy(alum->apellido_materno, content, 100);
  72. xmlFree(content);
  73. }
  74. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCURPAlumnoHP");
  75. if (content) {
  76. strncpy(alum->curp, content, 20);
  77. xmlFree(content);
  78. }
  79. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveCarrera");
  80. if (content) {
  81. strncpy(alum->clave_carrera, content, 2);
  82. xmlFree(content);
  83. }
  84. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblAlupla");
  85. if (content) {
  86. strncpy(alum->plan, content, 4);
  87. xmlFree(content);
  88. }
  89. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblCveUlsa");
  90. if (content) {
  91. strncpy(alum->clave, content, 7);
  92. xmlFree(content);
  93. }
  94. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblNombreAlumnoHP");
  95. if (content) {
  96. strncpy(alum->nombre, content, 100);
  97. xmlFree(content);
  98. }
  99. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblCorreoAlumnoHP");
  100. if (content) {
  101. strncpy(alum->correo, content, 100);
  102. xmlFree(content);
  103. }
  104. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblStat");
  105. if (content) {
  106. alum->estatus = content[0];
  107. xmlFree(content);
  108. }
  109. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblTelefonoAlumnoHP");
  110. if (content) {
  111. strncpy(alum->telefono, content, 11);
  112. xmlFree(content);
  113. }
  114. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_Header1_lblSem");
  115. if (content) {
  116. alum->semestre = atoi(content);
  117. xmlFree(content);
  118. }
  119. content = get_element_content_by_id(doc, "ctl00_contenedor_HistorialAlumno1_lblSexoAlumnoHP");
  120. if (content) {
  121. alum->sexo = content[0];
  122. xmlFree(content);
  123. }
  124. xmlFreeDoc(doc);
  125. }
  126. int main() {
  127. // PostgreSQL connection parameters
  128. const char *conninfo = "dbname=sgi user=postgres password=h3rcul3s#$ hostaddr=200.13.89.8 port=5432";
  129. PGconn *conn = PQconnectdb(conninfo);
  130. // Check connection status
  131. check_conn_status(conn);
  132. // Execute SQL query to retrieve HTML content
  133. PGresult *res = PQexec(conn, "SELECT datos_html FROM public.alumno_extraccion WHERE error_message IS NULL");
  134. check_exec_status(res, conn);
  135. // Process each row
  136. int rows = PQntuples(res);
  137. for (int i = 0; i < rows; i++) {
  138. char *html_content = PQgetvalue(res, i, 0);
  139. // printf("HTML Content: %s\n", html_content);
  140. struct alumno alum;
  141. memset(&alum, 0, sizeof(alum)); // Initialize the struct to zero
  142. parse_html(html_content, &alum);
  143. printf("Apellido Paterno: %s\n", alum.apellido_paterno);
  144. printf("Apellido Materno: %s\n", alum.apellido_materno);
  145. printf("CURP: %s\n", alum.curp);
  146. printf("Clave Carrera: %s\n", alum.clave_carrera);
  147. printf("Plan: %s\n", alum.plan);
  148. printf("Clave: %s\n", alum.clave);
  149. printf("Nombre: %s\n", alum.nombre);
  150. printf("Correo: %s\n", alum.correo);
  151. printf("Estatus: %c\n", alum.estatus);
  152. printf("Telefono: %s\n", alum.telefono);
  153. printf("Semestre: %d\n", alum.semestre);
  154. printf("Sexo: %c\n", alum.sexo);
  155. }
  156. // Clean up
  157. PQclear(res);
  158. PQfinish(conn);
  159. return 0;
  160. }