Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
adriacarrasquilla committed Oct 6, 2019
2 parents 906e3c5 + e6119fd commit f791247
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 41 deletions.
25 changes: 14 additions & 11 deletions linkDownload.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#!C:/Users/sergi/AppData/Local/Programs/Python/Python37-32/python.exe
print("Content-Type: text/html\n")

from urllib.request import Request, urlopen
import re
Expand Down Expand Up @@ -52,20 +52,21 @@ def urlToList(url, dominio):


def readLinks():

rutaGoogle = 'http://www.google.com/search?btnG=1&q=site%3A'
datosForm = cgi.FieldStorage()

if datosForm:
url = datosForm["url"]
url = datosForm["url"].value
arrayUrl = (url.split("//"))
if len(arrayUrl) == 2:
url = arrayUrl[1]
else:
print('Error url')
return 0

tema = datosForm["tema"]
print(url)

tema = datosForm["tema"].value

pagina = requests.get(rutaGoogle + url)
tree = html.fromstring(pagina.content)
Expand All @@ -85,14 +86,16 @@ def readLinks():

for link in arrayUrls[:-1]:
print("reading link: "+ link)
dicPalabrasLink = urlToList(link, urlSrc)
busqueda["LINKS"][link]=dicPalabrasLink[link] #añadimos entrada de url: palabras.

return busqueda

urlToList(link, "test"+str(i), url)
i+=1


else:
print('Error input')
return None

#print(readLinks())
#url = "https://www.elmundo.es/pais-vasco/2019/07/10/5d25f9af21efa0c0578b456f.html"
#urlToList(url,file)

readLinks()
#map reduce para contar palabras
78 changes: 53 additions & 25 deletions terrassahash.sql
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
-- phpMyAdmin SQL Dump
-- version 4.9.0.1
-- version 4.8.5
-- https://www.phpmyadmin.net/
--
-- Servidor: 127.0.0.1
-- Tiempo de generación: 06-10-2019 a las 00:41:32
-- Versión del servidor: 10.4.6-MariaDB
-- Versión de PHP: 7.3.9
-- Host: 127.0.0.1
-- Generation Time: Oct 06, 2019 at 02:02 AM
-- Server version: 10.1.38-MariaDB
-- PHP Version: 7.3.2

SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET AUTOCOMMIT = 0;
Expand All @@ -19,28 +19,40 @@ SET time_zone = "+00:00";
/*!40101 SET NAMES utf8mb4 */;

--
-- Base de datos: `terrassahash`
-- Database: `terrassahash`
--

-- --------------------------------------------------------

--
-- Estructura de tabla para la tabla `dominio`
-- Table structure for table `dominio`
--

--
-- NOTE(review): diff residue -- both the pre-change (`current_timestamp()`)
-- and post-change (`CURRENT_TIMESTAMP`) versions of the `fecha` and
-- `created` column definitions appear below. Keep exactly one of each
-- pair before executing this dump, otherwise the CREATE TABLE is invalid.
--
CREATE TABLE `dominio` (
`idDominio` int(11) NOT NULL,
`fecha` timestamp NOT NULL DEFAULT current_timestamp() ON UPDATE current_timestamp(),
`fecha` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
`ipUsuario` varchar(45) NOT NULL,
`nombreDominio` varchar(100) NOT NULL,
`tematica` varchar(100) NOT NULL,
`created` timestamp NOT NULL DEFAULT current_timestamp()
`created` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

--
-- Dumping data for table `dominio`
--

INSERT INTO `dominio` (`idDominio`, `fecha`, `ipUsuario`, `nombreDominio`, `tematica`, `created`) VALUES
(1, '2019-10-05 22:44:48', '192.168.1.9', 'lavanguardia.com', 'racismo', '2019-10-05 22:44:48'),
(2, '2019-10-05 22:45:07', '192.168.1.9', 'lavanguardia.com', 'racismo', '2019-10-05 22:45:07'),
(3, '2019-10-05 22:46:48', '192.168.1.9', 'facebook.com', 'racismo', '2019-10-05 22:46:48'),
(4, '2019-10-05 22:46:48', '192.168.1.9', 'twitter.com', 'racismo', '2019-10-05 22:46:48'),
(5, '2019-10-05 22:46:48', '192.168.1.9', 'mamuthack.com', 'racismo', '2019-10-05 22:46:48'),
(6, '2019-10-05 22:46:48', '192.168.1.9', 'https://www.forocoches.com/foro/showthread.php?t=6539653&page=2', 'racismo', '2019-10-05 22:46:48');

-- --------------------------------------------------------

--
-- Estructura de tabla para la tabla `link`
-- Table structure for table `link`
--

CREATE TABLE `link` (
Expand All @@ -49,10 +61,18 @@ CREATE TABLE `link` (
`indice` float NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

--
-- Dumping data for table `link`
--

INSERT INTO `link` (`idLink`, `idDominio`, `indice`) VALUES
(1, 1, 1),
(2, 2, 2);

-- --------------------------------------------------------

--
-- Estructura de tabla para la tabla `palabras`
-- Table structure for table `palabras`
--

CREATE TABLE `palabras` (
Expand All @@ -64,62 +84,70 @@ CREATE TABLE `palabras` (
) ENGINE=InnoDB DEFAULT CHARSET=latin1;

--
-- Índices para tablas volcadas
-- Dumping data for table `palabras`
--

INSERT INTO `palabras` (`idPalabras`, `Palabra`, `tipo`, `numeroVeces`, `idLink`) VALUES
(1, 'puto', '1', 23, 1),
(2, 'negrata', '2', 12, 2);

--
-- Indexes for dumped tables
--

--
-- Indices de la tabla `dominio`
-- Indexes for table `dominio`
--
ALTER TABLE `dominio`
ADD PRIMARY KEY (`idDominio`);

--
-- Indices de la tabla `link`
-- Indexes for table `link`
--
ALTER TABLE `link`
ADD PRIMARY KEY (`idLink`);

--
-- Indices de la tabla `palabras`
-- Indexes for table `palabras`
--
ALTER TABLE `palabras`
ADD PRIMARY KEY (`idPalabras`),
ADD KEY `fkIdLink` (`idLink`);

--
-- AUTO_INCREMENT de las tablas volcadas
-- AUTO_INCREMENT for dumped tables
--

--
-- AUTO_INCREMENT de la tabla `dominio`
-- AUTO_INCREMENT for table `dominio`
--
ALTER TABLE `dominio`
MODIFY `idDominio` int(11) NOT NULL AUTO_INCREMENT;
MODIFY `idDominio` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=7;

--
-- AUTO_INCREMENT de la tabla `link`
-- AUTO_INCREMENT for table `link`
--
ALTER TABLE `link`
MODIFY `idLink` int(11) NOT NULL AUTO_INCREMENT;
MODIFY `idLink` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=3;

--
-- AUTO_INCREMENT de la tabla `palabras`
-- AUTO_INCREMENT for table `palabras`
--
ALTER TABLE `palabras`
MODIFY `idPalabras` int(11) NOT NULL AUTO_INCREMENT;
MODIFY `idPalabras` int(11) NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=3;

--
-- Restricciones para tablas volcadas
-- Constraints for dumped tables
--

--
-- Filtros para la tabla `link`
-- Constraints for table `link`
--
ALTER TABLE `link`
ADD CONSTRAINT `fkIdDominio` FOREIGN KEY (`idLink`) REFERENCES `dominio` (`idDominio`) ON DELETE CASCADE;

--
-- Filtros para la tabla `palabras`
-- Constraints for table `palabras`
--
ALTER TABLE `palabras`
ADD CONSTRAINT `fkIdLink` FOREIGN KEY (`idLink`) REFERENCES `link` (`idLink`);
Expand Down
14 changes: 9 additions & 5 deletions web/assets/js/analizador.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ function ejecutarAnalisis(){

mostrarLoader();

iniciarProceso(dominio, tematica)
iniciarProceso(dominio.value, tematica.value)
}

function mostrarAlerta(mensaje){
Expand All @@ -27,12 +27,16 @@ function mostrarLoader(){

// NOTE(review): diff residue -- this span interleaves the pre- and
// post-change versions of iniciarProceso (two `url:` lines and two
// success-handler bodies appear). Reconstruct a single coherent version
// before use; as written this is not valid JavaScript.
function iniciarProceso(dominio, tematica) {
// POSTs the domain and topic to the Python CGI backend via jQuery AJAX.
    $.ajax({
// old hunk line (absolute path):
url: "/../../../linkDownload.py",
// new hunk line (relative path):
url: "../linkDownload.py",
type: "POST",
cache: false,
data: {"url": dominio, "tema": tematica},
success: function (response) {
// old success body rendered the response into #thenode:
$('#thenode').html(response);
}
// new success body alerts the raw response instead:
alert(response)
},
// new hunk added an error handler that surfaces HTTP status + message:
error: function (xhr, ajaxOptions, thrownError) {
alert(xhr.status);
alert(thrownError);
}
    });

}
47 changes: 47 additions & 0 deletions web/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!C:/Users/sergi/AppData/Local/Programs/Python/Python37-32/python.exe
# NOTE(review): machine-specific Windows shebang -- breaks on any other host.
# CGI response header: must be emitted before any other output.
print("Content-Type: text/html\n")
from urllib.request import Request, urlopen
import re
import cgi
from bs4 import BeautifulSoup
import json
import sys
from lxml import html
import requests


# Open the web page and acquire its text.
# Default inputs for a manual test run of urlToList below.
url = "https://www.freecodecamp.org/news/my-first-python-project-converting-a-disorganized-text-file-into-a-neatly-structured-csv-file-21f4c6af502d"
file = 'test'  # output file stem (".json" is appended by urlToList)

def urlToList(url, fileName):
    """Fetch *url*, extract its visible text, and dump the words to JSON.

    The page body is stripped of ``<script>``/``<style>`` elements,
    lowercased, split into individual words, and written to
    ``<fileName>.json`` as ``{"lista": [word, ...]}``.

    Parameters
    ----------
    url : str
        Page to download.
    fileName : str
        Output path stem; ``.json`` is appended.

    Returns
    -------
    bool
        Always ``True`` on success; network/parse errors propagate.
    """
    # Spoof a browser User-Agent: some sites reject urllib's default agent.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, features="lxml")

    # Remove script and style elements so only human-visible text remains.
    for script in soup(["script", "style"]):
        script.extract()

    # Visible text only (assumes the document has a <body> -- TODO confirm).
    text = soup.body.get_text()

    # Normalise whitespace: split into stripped lines, then into
    # space-separated phrases, and rejoin non-empty chunks one per line.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    # BUG FIX: the original joined with the literal string '/n', which is not
    # whitespace -- the later text.split() then returned the entire page as a
    # single mangled token. Join with a real newline instead.
    text = '\n'.join(chunk for chunk in chunks if chunk)

    # Work in lowercase so later word lookups are case-insensitive.
    text = text.lower()
    x = {
        "lista": text.split()
    }

    # Persist the word list; ensure_ascii=False keeps accented characters
    # (Spanish text) readable in the JSON file.
    with open(fileName + '.json', 'w') as filehandle:
        json.dump(x, filehandle, ensure_ascii=False)

    return True

print("test.com")

0 comments on commit f791247

Please sign in to comment.