Merge remote-tracking branch 'origin/master'

hakiour · Oct 6, 2019 · f791247 · f791247
2 parents 906e3c5 + e6119fd
commit f791247
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 41 deletions.
diff --git a/linkDownload.py b/linkDownload.py
@@ -1,5 +1,5 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
+#!C:/Users/sergi/AppData/Local/Programs/Python/Python37-32/python.exe
+print("Content-Type: text/html\n")
 
 from urllib.request import Request, urlopen
 import re
@@ -52,20 +52,21 @@ def urlToList(url, dominio):
 
 
 def readLinks():
-
 	rutaGoogle = 'http://www.google.com/search?btnG=1&q=site%3A'
 	datosForm = cgi.FieldStorage()
 
 	if datosForm:
-		url = datosForm["url"]
+		url = datosForm["url"].value
 		arrayUrl = (url.split("//"))
 		if len(arrayUrl) == 2: 
 			url = arrayUrl[1]
 		else: 
 			print('Error url')
 			return 0
 
-		tema = datosForm["tema"]
+		print(url)
+
+		tema = datosForm["tema"].value
 
 		pagina = requests.get(rutaGoogle + url)
 		tree = html.fromstring(pagina.content)
@@ -85,14 +86,16 @@ def readLinks():
 
 		for link in arrayUrls[:-1]:
 			print("reading link: "+ link)
-			dicPalabrasLink = urlToList(link, urlSrc)
-			busqueda["LINKS"][link]=dicPalabrasLink[link]				#añadimos entrada de url: palabras.
-
-		return busqueda
-
+			urlToList(link, "test"+str(i), url)
+			i+=1
+
+
 	else:
 		print('Error input')
 		return None
 
-#print(readLinks())
+#url = "https://www.elmundo.es/pais-vasco/2019/07/10/5d25f9af21efa0c0578b456f.html"
+#urlToList(url,file)
 
+readLinks()
+#map reduce para contar palabras
diff --git a/terrassahash.sql b/terrassahash.sql
@@ -1,11 +1,11 @@
 -- phpMyAdmin SQL Dump
--- version 4.9.0.1
+-- version 4.8.5
 -- https://www.phpmyadmin.net/
 --
--- Servidor: 127.0.0.1
--- Tiempo de generación: 06-10-2019 a las 00:41:32
--- Versión del servidor: 10.4.6-MariaDB
--- Versión de PHP: 7.3.9
+-- Host: 127.0.0.1
+-- Generation Time: Oct 06, 2019 at 02:02 AM
+-- Server version: 10.1.38-MariaDB
+-- PHP Version: 7.3.2
 
 SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
 SET AUTOCOMMIT = 0;
@@ -19,28 +19,40 @@ SET time_zone = "+00:00";
 /*!40101 SET NAMES utf8mb4 */;
 
 --
--- Base de datos: `terrassahash`
+-- Database: `terrassahash`
 --
 
 -- --------------------------------------------------------
 
 --
--- Estructura de tabla para la tabla `dominio`
+-- Table structure for table `dominio`
 --
 
 CREATE TABLE `dominio` (
   `idDominio` int(11) NOT NULL,
-  `fecha` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
+  `fecha` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
   `ipUsuario` varchar(45) NOT NULL,
   `nombreDominio` varchar(100) NOT NULL,
   `tematica` varchar(100) NOT NULL,
-  `created` timestamp NOT NULL DEFAULT current_timestamp()
+  `created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
 ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
 
+--
+-- Dumping data for table `dominio`
+--
+
+INSERT INTO `dominio` (`idDominio`, `fecha`, `ipUsuario`, `nombreDominio`, `tematica`, `created`) VALUES
+(1, '2019-10-05 22:44:48', '192.168.1.9', 'lavanguardia.com', 'racismo', '2019-10-05 22:44:48'),
+(2, '2019-10-05 22:45:07', '192.168.1.9', 'lavanguardia.com', 'racismo', '2019-10-05 22:45:07'),
+(3, '2019-10-05 22:46:48', '192.168.1.9', 'facebook.com', 'racismo', '2019-10-05 22:46:48'),
+(4, '2019-10-05 22:46:48', '192.168.1.9', 'twitter.com', 'racismo', '2019-10-05 22:46:48'),
+(5, '2019-10-05 22:46:48', '192.168.1.9', 'mamuthack.com', 'racismo', '2019-10-05 22:46:48'),
+(6, '2019-10-05 22:46:48', '192.168.1.9', 'https://www.forocoches.com/foro/showthread.php?t=6539653&page=2', 'racismo', '2019-10-05 22:46:48');
+
 -- --------------------------------------------------------
 
 --
--- Estructura de tabla para la tabla `link`
+-- Table structure for table `link`
 --
 
 CREATE TABLE `link` (
@@ -49,10 +61,18 @@ CREATE TABLE `link` (
   `indice` float NOT NULL
 ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
 
+--
+-- Dumping data for table `link`
+--
+
+INSERT INTO `link` (`idLink`, `idDominio`, `indice`) VALUES
+(1, 1, 1),
+(2, 2, 2);
+
 -- --------------------------------------------------------
 
 --
--- Estructura de tabla para la tabla `palabras`
+-- Table structure for table `palabras`
 --
 
 CREATE TABLE `palabras` (
@@ -64,62 +84,70 @@ CREATE TABLE `palabras` (
 ) ENGINE=InnoDB DEFAULT CHARSET=latin1;
 
 --
--- Índices para tablas volcadas
+-- Dumping data for table `palabras`
+--
+
+INSERT INTO `palabras` (`idPalabras`, `Palabra`, `tipo`, `numeroVeces`, `idLink`) VALUES
+(1, 'puto', '1', 23, 1),
+(2, 'negrata', '2', 12, 2);
+
+--
+-- Indexes for dumped tables
 --
 
 --
--- Indices de la tabla `dominio`
+-- Indexes for table `dominio`
 --
 ALTER TABLE `dominio`
   ADD PRIMARY KEY (`idDominio`);
 
 --
--- Indices de la tabla `link`
+-- Indexes for table `link`
 --
 ALTER TABLE `link`
   ADD PRIMARY KEY (`idLink`);
 
 --
--- Indices de la tabla `palabras`
+-- Indexes for table `palabras`
 --
 ALTER TABLE `palabras`
   ADD PRIMARY KEY (`idPalabras`),
   ADD KEY `fkIdLink` (`idLink`);
 
 --
--- AUTO_INCREMENT de las tablas volcadas
+-- AUTO_INCREMENT for dumped tables
 --
 
 --
--- AUTO_INCREMENT de la tabla `dominio`
+-- AUTO_INCREMENT for table `dominio`
 --
 ALTER TABLE `dominio`
-  MODIFY `idDominio` int(11) NOT NULL AUTO_INCREMENT;
+  MODIFY `idDominio` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=7;
 
 --
--- AUTO_INCREMENT de la tabla `link`
+-- AUTO_INCREMENT for table `link`
 --
 ALTER TABLE `link`
-  MODIFY `idLink` int(11) NOT NULL AUTO_INCREMENT;
+  MODIFY `idLink` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=3;
 
 --
--- AUTO_INCREMENT de la tabla `palabras`
+-- AUTO_INCREMENT for table `palabras`
 --
 ALTER TABLE `palabras`
-  MODIFY `idPalabras` int(11) NOT NULL AUTO_INCREMENT;
+  MODIFY `idPalabras` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=3;
 
 --
--- Restricciones para tablas volcadas
+-- Constraints for dumped tables
 --
 
 --
--- Filtros para la tabla `link`
+-- Constraints for table `link`
 --
 ALTER TABLE `link`
   ADD CONSTRAINT `fkIdDominio` FOREIGN KEY (`idLink`) REFERENCES `dominio` (`idDominio`) ON DELETE CASCADE;
 
 --
--- Filtros para la tabla `palabras`
+-- Constraints for table `palabras`
 --
 ALTER TABLE `palabras`
   ADD CONSTRAINT `fkIdLink` FOREIGN KEY (`idLink`) REFERENCES `link` (`idLink`);

diff --git a/web/assets/js/analizador.js b/web/assets/js/analizador.js
@@ -13,7 +13,7 @@ function ejecutarAnalisis(){
 
     mostrarLoader();
 
-    iniciarProceso(dominio, tematica)
+    iniciarProceso(dominio.value, tematica.value)
 }
 
 function mostrarAlerta(mensaje){
@@ -27,12 +27,16 @@ function mostrarLoader(){
 
 function iniciarProceso(dominio, tematica) {
     $.ajax({
-        url: "/../../../linkDownload.py",
+        url: "../linkDownload.py",
         type: "POST",
-        cache: false,
+        data: {"url": dominio, "tema": tematica},
         success: function (response) {
-            $('#thenode').html(response);
-        }
+            alert(response)
+        },
+        error: function (xhr, ajaxOptions, thrownError) {
+            alert(xhr.status);
+            alert(thrownError);
+          }
     });
 
 }
diff --git a/web/test.py b/web/test.py
@@ -0,0 +1,47 @@
+#!C:/Users/sergi/AppData/Local/Programs/Python/Python37-32/python.exe
+print("Content-Type: text/html\n")
+from urllib.request import Request, urlopen
+import re
+import cgi
+from bs4 import BeautifulSoup
+import json
+import sys
+from lxml import html
+import requests
+
+
+#Abrir pagina web i adquirir texto
+url = "https://www.freecodecamp.org/news/my-first-python-project-converting-a-disorganized-text-file-into-a-neatly-structured-csv-file-21f4c6af502d"
+file = 'test'
+
+
+def urlToList(url, fileName):
+	"""Download a web page and store its visible words as a JSON word list.
+
+	The page at ``url`` is fetched with a browser-like User-Agent, ``<script>``
+	and ``<style>`` elements are dropped, and the remaining body text is
+	lower-cased and split on whitespace.  The words are written to
+	``<fileName>.json`` as ``{"lista": [...]}``.
+
+	Args:
+		url: Address of the page to download.
+		fileName: Output path without the ``.json`` extension.
+
+	Returns:
+		True once the file has been written.
+	"""
+	req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+	# Close the HTTP response deterministically instead of relying on GC.
+	with urlopen(req) as response:
+		webpage = response.read()
+	soup = BeautifulSoup(webpage, features="lxml")
+
+	# Remove script and style elements so only visible text remains.
+	for script in soup(["script", "style"]):
+		script.extract()
+
+	# Keep only the text of the document body.
+	text = soup.body.get_text()
+
+	# Strip every line, break runs separated by two spaces into their own
+	# chunks, and drop the empty ones.
+	lines = (line.strip() for line in text.splitlines())
+	chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+	# Join with a real newline: the previous literal '/n' glued the last word
+	# of one chunk to the first word of the next, so split() never separated them.
+	text = '\n'.join(chunk for chunk in chunks if chunk)
+
+	# Always work with lower-case words to avoid case mismatches when searching.
+	text = text.lower()
+	x = {
+		"lista": text.split()
+	}
+
+	# Write the word list to disk.  ensure_ascii=False emits non-ASCII
+	# characters verbatim, so the file must be opened as UTF-8 explicitly;
+	# the platform default encoding may not be able to represent them.
+	with open(fileName + '.json', 'w', encoding='utf-8') as filehandle:
+		json.dump(x, filehandle, ensure_ascii=False)
+
+	return True
+
+print("test.com")