Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master'
Browse files Browse the repository at this point in the history
  • Loading branch information
hakiour committed Oct 6, 2019
2 parents 1498806 + e6119fd commit 0059ed4
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 9 deletions.
12 changes: 6 additions & 6 deletions linkDownload.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,21 +51,21 @@ def urlToList(url, fileName, dominio):


def readLinks():

rutaGoogle = 'http://www.google.com/search?btnG=1&q=site%3A'
datosForm = cgi.FieldStorage()

if datosForm:
print("adasd")
url = datosForm["url"]
url = datosForm["url"].value
arrayUrl = (url.split("//"))
if len(arrayUrl) == 2:
url = arrayUrl[1]
else:
print('Error url')
return 0

tema = datosForm["tema"]
print(url)

tema = datosForm["tema"].value

pagina = requests.get(rutaGoogle + url)
tree = html.fromstring(pagina.content)
Expand All @@ -83,12 +83,12 @@ def readLinks():
urlToList(link, "test"+str(i), url)
i+=1


else:
print('Error input')

#url = "https://www.elmundo.es/pais-vasco/2019/07/10/5d25f9af21efa0c0578b456f.html"
#urlToList(url,file)

#readLinks()
print("asd")
readLinks()
#map reduce para contar palabras
6 changes: 3 additions & 3 deletions web/assets/js/analizador.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ function ejecutarAnalisis(){

mostrarLoader();

iniciarProceso(dominio, tematica)
iniciarProceso(dominio.value, tematica.value)
}

function mostrarAlerta(mensaje){
Expand All @@ -27,9 +27,9 @@ function mostrarLoader(){

function iniciarProceso(dominio, tematica) {
$.ajax({
url: "./linkDownload.py",
url: "../linkDownload.py",
type: "POST",
data:{ "url":dominio, "tema": tematica},
data: {"url": dominio, "tema": tematica},
success: function (response) {
alert(response)
},
Expand Down
47 changes: 47 additions & 0 deletions web/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!C:/Users/sergi/AppData/Local/Programs/Python/Python37-32/python.exe
print("Content-Type: text/html\n")
from urllib.request import Request, urlopen
import re
import cgi
from bs4 import BeautifulSoup
import json
import sys
from lxml import html
import requests


# Open a web page and acquire its text — sample inputs for manual testing.
url = "https://www.freecodecamp.org/news/my-first-python-project-converting-a-disorganized-text-file-into-a-neatly-structured-csv-file-21f4c6af502d"
# NOTE(review): `file` appears unused in this file — presumably carried over
# from linkDownload.py; confirm before removing.
file = 'test'


def urlToList(url, fileName):
    """Fetch *url*, extract the human-visible body text, and dump it as a
    JSON word list to ``fileName + '.json'``.

    The output file contains ``{"lista": [word, ...]}`` with every word
    lower-cased.  Returns True on success; network or parse errors propagate
    to the caller.
    """
    # Spoof a browser User-Agent so sites that block "Python-urllib" respond.
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = BeautifulSoup(webpage, features="lxml")

    # Remove script and style elements so only visible text remains.
    for script in soup(["script", "style"]):
        script.extract()

    # Extract only the text of the document body.
    text = soup.body.get_text()

    # Split into stripped lines, break multi-headline lines apart, and
    # re-join the non-empty chunks one per line.
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))  # break multi-headlines into a line each
    # BUG FIX: the original joined with the literal string '/n', which glued
    # neighbouring chunks into single tokens such as 'word/nword' once
    # text.split() ran below.  A real newline keeps words separated.
    text = '\n'.join(chunk for chunk in chunks if chunk)  # drop blank lines

    # Work in lower case throughout to avoid case-sensitive search misses.
    text = text.lower()
    x = {
        "lista": text.split()
    }

    # Persist the word list.  Force UTF-8 explicitly: with ensure_ascii=False
    # the platform default encoding (e.g. cp1252 on the Windows host this
    # script targets) could raise UnicodeEncodeError on non-ASCII words.
    with open(fileName + '.json', 'w', encoding='utf-8') as filehandle:
        json.dump(x, filehandle, ensure_ascii=False)

    return True

# NOTE(review): looks like a leftover debug print — because of the CGI
# Content-Type header emitted at the top of this script, this line ends up
# in the HTTP response body.  Confirm it is intentional.
print("test.com")

0 comments on commit 0059ed4

Please sign in to comment.