store http sessions in redis separately
pudo committed Aug 19, 2018
1 parent 43ccc8a commit 5c6e16a
Showing 9 changed files with 59 additions and 24 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -4,5 +4,6 @@ data*
 memorious.env
 docker-compose.*
 .git
+.tox
 .gitignore
 __pycache__
14 changes: 8 additions & 6 deletions Makefile
@@ -1,3 +1,4 @@
+DOCKER=docker run -v $(PWD)/dist:/ingestors/dist -ti alephdata/memorious
 
 all: clean
 
@@ -9,15 +10,16 @@ clean:
 	find . -name '*.pyo' -exec rm -f {} +
 
 build:
-	docker-compose build --pull
-	docker-compose run --rm worker memorious upgrade
+	docker build -t alephdata/memorious .
 
 rebuild:
-	docker-compose build --pull --no-cache
-	docker-compose run --rm worker memorious upgrade
+	docker build --pull --no-cache -t alephdata/memorious .
 
+test:
+	tox
+
 shell:
-	docker-compose run --rm worker /bin/bash
+	$(DOCKER) /bin/bash
 
 image:
-	docker build -t alephdata/memorious:latest .
+	docker build -t alephdata/memorious .
3 changes: 2 additions & 1 deletion memorious/helpers/ocr.py
@@ -1,9 +1,9 @@
-from tesserocr import PyTessBaseAPI  # Requires Tesseract 0.3.4+
 
 
 def read_word(image, whitelist=None, chars=None, spaces=False):
     """ OCR a single word from an image. Useful for captchas.
     Image should be pre-processed to remove noise etc. """
+    from tesserocr import PyTessBaseAPI
     api = PyTessBaseAPI()
     api.SetPageSegMode(8)
     if whitelist is not None:
@@ -24,6 +24,7 @@ def read_word(image, whitelist=None, chars=None, spaces=False):
 
 def read_char(image, whitelist=None):
     """ OCR a single character from an image. Useful for captchas."""
+    from tesserocr import PyTessBaseAPI
     api = PyTessBaseAPI()
     api.SetPageSegMode(10)
     if whitelist is not None:
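
Moving the tesserocr import into the function bodies makes Tesseract an optional dependency: it is only loaded when OCR is actually requested. A minimal usage sketch (the image path and digit whitelist are invented for illustration):

from PIL import Image
from memorious.helpers.ocr import read_word

# Hypothetical pre-processed captcha image; remove noise before OCR.
image = Image.open("captcha.png")
# Restrict recognition to digits via the whitelist parameter.
word = read_word(image, whitelist="0123456789")
print(word)
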
1 change: 1 addition & 0 deletions memorious/helpers/ua.py
@@ -1,5 +1,6 @@
 import random
 
+
 class UserAgent(object):
 
     def __init__(self):
23 changes: 10 additions & 13 deletions memorious/logic/http.py
@@ -1,9 +1,8 @@
 import cgi
 import json
-import pickle
-import codecs
-from lxml import html, etree
 from hashlib import sha1
+from lxml import html, etree
+from urllib.parse import unquote
 from banal import hash_data, is_mapping
 from urlnormalizer import normalize_url
 from celestial import parse_mimetype, normalize_mimetype
@@ -12,10 +11,9 @@
 from requests.structures import CaseInsensitiveDict
 from datetime import datetime, timedelta
 
-from six.moves.urllib.parse import unquote
-
 from memorious import settings
 from memorious.core import storage
+from memorious.model.session import SessionState
 from memorious.logic.mime import NON_HTML
 from memorious.exc import ParseError
 from memorious.helpers.ua import UserAgent
@@ -24,7 +22,7 @@
 
 
 class ContextHttp(object):
-    STATE_SESSION = '_http_session'
+    STATE_SESSION = '_http'
 
     def __init__(self, context):
         self.context = context
@@ -33,11 +31,11 @@ def __init__(self, context):
         if 'cache' in context.params:
             self.cache = context.params.get('cache')
 
-        self.session = None
-        if self.STATE_SESSION in self.context.state:
-            session = self.context.state.get(self.STATE_SESSION)
-            session = codecs.decode(bytes(session, 'utf-8'), 'base64')
-            self.session = pickle.loads(session)
-        else:
+        key = self.context.state.get(self.STATE_SESSION)
+        self.session = SessionState.get(context.crawler, key)
+        if self.session is None:
             self.reset()
 
     def reset(self):
@@ -72,9 +70,8 @@ def rehash(self, data):
         return ContextHttpResponse.deserialize(self, data)
 
     def save(self):
-        session = pickle.dumps(self.session)
-        session = codecs.encode(session, 'base64').decode()
-        self.context.state[self.STATE_SESSION] = session
+        key = SessionState.save(self.context.crawler, self.session)
+        self.context.state[self.STATE_SESSION] = key
 
 
 class ContextHttpResponse(object):
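
The net effect inside a crawler stage: instead of inlining a base64-encoded pickle of the requests session into the context state, only a short Redis key is kept there, and SessionState handles the (de)serialization. A hedged sketch (the stage name and URL are invented; context.http.get and context.emit follow memorious' stage API):

def login(context, data):
    # Cookies set by the server accumulate on context.http.session.
    context.http.get("https://example.com/login")
    # save() (whether called directly or when the framework persists
    # state) now stores the pickled session in Redis via SessionState
    # and records only the returned key under context.state['_http'].
    context.http.save()
    context.emit(data=data)
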
2 changes: 2 additions & 0 deletions memorious/model/common.py
@@ -1,4 +1,5 @@
 import json
+from banal.dicts import clean_dict
 from datetime import datetime, date
 
 from memorious.core import connect_redis
@@ -44,6 +45,7 @@ def default(self, obj):
 def dump_json(data):
     if data is None:
         return
+    data = clean_dict(data)
     return JSONEncoder().encode(data)
 
 
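With clean_dict applied, dump_json drops None-valued entries before encoding (assuming banal's clean_dict semantics), so serialized payloads no longer carry null fields into Redis. A quick illustration:

from memorious.model.common import dump_json

payload = {"url": "https://example.com", "etag": None}
# The None-valued 'etag' key is stripped before JSON encoding.
print(dump_json(payload))  # -> {"url": "https://example.com"}
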
2 changes: 0 additions & 2 deletions memorious/model/queue.py
@@ -1,4 +1,3 @@
-import random
 import logging
 from collections import deque
 from datetime import datetime, timedelta
@@ -35,7 +34,6 @@ def serialize_task_data(cls, stage, state, data, delay):
     @classmethod
     def tasks(cls):
         queues = [make_key('queue', c, s) for c, s in manager.stages]
-        random.shuffle(queues)
         while True:
             task_data_tuple = cls.conn.blpop(queues)
             # blpop blocks until it finds something. But fakeredis has no
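
Dropping the shuffle relies on BLPOP's multi-key semantics: Redis checks the given keys in order and pops from the first non-empty list, so the fixed ordering of manager.stages now determines queue priority rather than a random one. A small standalone sketch (assumes a local Redis for illustration):

import redis

conn = redis.Redis()
conn.rpush("queue:stage-a", "task-1")
conn.rpush("queue:stage-b", "task-2")
# Both queues hold work, but BLPOP always serves the first-listed key.
key, value = conn.blpop(["queue:stage-a", "queue:stage-b"], timeout=1)
print(key, value)  # b'queue:stage-a' b'task-1'
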
34 changes: 34 additions & 0 deletions memorious/model/session.py
@@ -0,0 +1,34 @@
+import pickle
+import codecs
+import logging
+from hashlib import sha1
+
+from memorious.model.common import Base
+from memorious.util import make_key
+
+log = logging.getLogger(__name__)
+
+
+class SessionState(Base):
+    """An HTTP session state."""
+
+    @classmethod
+    def save(cls, crawler, session):
+        session = pickle.dumps(session)
+        session = codecs.encode(session, 'base64')
+        key = sha1(session).hexdigest()[:15]
+        key = make_key(crawler, "session", key)
+        cls.conn.set(key, session, ex=84600)
+        return key
+
+    @classmethod
+    def get(cls, crawler, key):
+        value = cls.conn.get(make_key(crawler, "session", key))
+        if value is not None:
+            session = codecs.decode(bytes(value, 'utf-8'), 'base64')
+            return pickle.loads(session)
+
+    @classmethod
+    def delete(cls, crawler):
+        for key in cls.conn.scan_iter(make_key(crawler, "session", "*")):
+            cls.conn.delete(key)
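
A hedged sketch of the new round trip (a crawler name stands in for the crawler object, and a reachable Redis is assumed): the session is pickled, base64-encoded, and stored under a content-hash key with a TTL of 84600 seconds (just under a day), so only the short key has to travel through crawler state.

import requests
from memorious.model.session import SessionState

session = requests.Session()
session.cookies.set("sid", "abc123")

# save() returns a key like '<crawler>:session:<sha1-prefix>'
# (assuming make_key joins its parts with colons).
key = SessionState.save("my_crawler", session)
restored = SessionState.get("my_crawler", key)  # None once the TTL lapses
assert restored.cookies.get("sid") == "abc123"
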
3 changes: 1 addition & 2 deletions tox.ini
@@ -3,8 +3,7 @@ envlist = py37
 skipsdist = True
 
 [testenv]
-commands = pip install -q .[dev]
-    pip install -q pyicu
+commands = pip install -q .[dev] pyicu
     pytest --cov=memorious --cov-report term-missing
 
 [pytest]
