Skip to content

Commit

Permalink
feat: basic jina reader
Browse files Browse the repository at this point in the history
  • Loading branch information
hanfangyuan4396 committed Apr 19, 2024
1 parent c780c1e commit e1c0a62
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 160 deletions.
160 changes: 1 addition & 159 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,160 +1,2 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
config.json
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,16 @@
# jina_sumary
# jina_sumary
ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT总结网页链接内容

微信近期更新了公众号链接的校验机制,公众号卡片链接无法被直接访问,目前暂时没有找到合适的方法从公众号卡片链接获取到直接链接。不过,此插件仍然可以总结公众号文章的直接链接,以及其他平台的卡片链接(如小红书、哔哩哔哩等)。

config.json 配置说明(下方示例中的 `//` 注释仅用于说明,标准 JSON 不支持注释,实际配置文件中请删除,或直接复制 config.json.template)
```json
{
"jina_reader_base": "https://r.jina.ai", // jina reader链接,默认为https://r.jina.ai
"open_ai_api_base": "https://api.openai.com/v1", // chatgpt chat url
"open_ai_api_key": "sk-xxx", // chatgpt api key
"open_ai_model": "gpt-3.5-turbo", // chatgpt model
"max_words": 8000, // 网页链接内容的最大字数,防止超过最大输入token,使用字符串长度简单计数
"prompt": "我需要对下面的文本进行总结,总结输出包括以下三个部分:\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。" // 链接内容总结提示词
}
```
1 change: 1 addition & 0 deletions __init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .jina_sum import *
8 changes: 8 additions & 0 deletions config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"jina_reader_base": "https://r.jina.ai",
"open_ai_api_base": "https://api.openai.com/v1",
"open_ai_api_key": "sk-xxx",
"open_ai_model": "gpt-3.5-turbo",
"max_words": 8000,
"prompt": "我需要对下面的文本进行总结,总结输出包括以下三个部分:\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。"
}
133 changes: 133 additions & 0 deletions jina_sum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# encoding:utf-8
import json
import os
from urllib.parse import urlparse

import requests

import plugins
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common.log import logger
from plugins import *

@plugins.register(
    name="JinaSum",
    desire_priority=110,
    hidden=False,
    desc="Sum url link content with jina reader and llm",
    version="0.1",
    author="hanfangyuan",
)
class JinaSum(Plugin):
    """Summarize the web page behind a shared/text URL.

    The page is fetched as plain text through a Jina-reader-style endpoint
    (``<jina_reader_base>/<url>``) and the text is then sent, together with a
    summarization prompt, to an OpenAI-compatible ``/chat/completions``
    endpoint. The model's answer is returned as the reply.

    The class attributes below are the defaults; each one can be overridden
    by the same-named key of the plugin configuration.
    """

    jina_reader_base = "https://r.jina.ai"
    open_ai_api_base = "https://api.openai.com/v1"
    # Default so the attribute always exists, even before configuration is read.
    open_ai_api_key = ""
    open_ai_model = "gpt-3.5-turbo"
    # Maximum number of characters of page text forwarded to the model.
    max_words = 8000
    prompt = "我需要对下面引号内文档进行总结,总结输出包括以下三个部分:\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动\n\n"

    # Number of retries after the first failed attempt (4 attempts in total).
    _MAX_RETRIES = 3
    # Timeout, in seconds, applied to both outgoing HTTP requests.
    _HTTP_TIMEOUT = 60

    def __init__(self):
        """Load the configuration and register the context handler.

        Raises:
            RuntimeError: if anything goes wrong during initialization; the
                original exception is chained as ``__cause__``.
        """
        super().__init__()
        try:
            self.config = super().load_config()
            if not self.config:
                self.config = self._load_config_template()
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.open_ai_api_base = self.config.get("open_ai_api_base", self.open_ai_api_base)
            self.open_ai_api_key = self.config.get("open_ai_api_key", "")
            self.open_ai_model = self.config.get("open_ai_model", self.open_ai_model)
            self.max_words = self.config.get("max_words", self.max_words)
            self.prompt = self.config.get("prompt", self.prompt)
            # Never write the API key to the log.
            logger.info(f"[JinaSum] inited, config={self._redacted_config()}")
            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[JinaSum] 初始化异常:{e}")
            # Raising a bare string is a TypeError in Python 3; raise a real
            # exception and keep the root cause attached.
            raise RuntimeError("[JinaSum] init failed, ignore ") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle an incoming message and, if it is a URL, reply with a summary.

        Messages that are neither TEXT nor SHARING, or whose content is not an
        acceptable URL (see ``_check_url``), are left untouched.

        Args:
            e_context: event context; its ``"context"`` and ``"channel"``
                entries are read, and ``"reply"`` / ``action`` are set when
                the plugin produces a reply.
            retry_count: number of attempts already made. Callers normally
                leave this at 0; the method passes it to itself when retrying.
        """
        try:
            context = e_context["context"]
            if context.type not in (ContextType.SHARING, ContextType.TEXT):
                return

            content = context.content
            if not self._check_url(content):
                logger.debug(f"[JinaSum] {content} not a url, skip")
                return

            if retry_count == 0:
                # Only notify the user once, not on every retry.
                logger.debug("[JinaSum] on_handle_context. content: %s" % content)
                reply = Reply(ReplyType.TEXT, "🎉正在为您生成总结,请稍候...")
                channel = e_context["channel"]
                channel.send(reply, context)

            # _check_url validates the stripped text, so fetch the stripped text too.
            result = self._summarize(content.strip())
            e_context["reply"] = Reply(ReplyType.TEXT, result)
            e_context.action = EventAction.BREAK_PASS

        except Exception as e:
            if retry_count < self._MAX_RETRIES:
                logger.warning(f"[JinaSum] {str(e)}, retry {retry_count + 1}")
                self.on_handle_context(e_context, retry_count + 1)
                return

            logger.exception(f"[JinaSum] {str(e)}")
            reply = Reply(ReplyType.ERROR, "我暂时无法总结链接,请稍后再试")
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help text shown for this plugin."""
        return '使用jina reader和ChatGPT总结网页链接内容'

    def _summarize(self, target_url):
        """Fetch ``target_url`` through the reader and return the model's summary.

        Raises whatever ``requests`` raises on network/HTTP errors, and
        ``KeyError``/``IndexError`` if the chat response lacks the expected
        ``choices[0].message.content`` field.
        """
        jina_url = self._get_jina_url(target_url)
        response = requests.get(jina_url, timeout=self._HTTP_TIMEOUT)
        response.raise_for_status()
        target_url_content = response.text

        openai_chat_url = self._get_openai_chat_url()
        openai_headers = self._get_openai_headers()
        openai_payload = self._get_openai_payload(target_url_content)
        # Headers are deliberately not logged: they carry the bearer token.
        logger.debug(f"[JinaSum] openai_chat_url: {openai_chat_url}, openai_payload: {openai_payload}")
        response = requests.post(
            openai_chat_url,
            headers=openai_headers,
            json=openai_payload,
            timeout=self._HTTP_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()['choices'][0]['message']['content']

    def _redacted_config(self):
        """Return a copy of the configuration with the API key masked for logging."""
        safe_config = dict(self.config)
        if safe_config.get("open_ai_api_key"):
            safe_config["open_ai_api_key"] = "***"
        return safe_config

    def _load_config_template(self):
        """Load ``config.json.template`` from the plugin directory.

        Returns:
            The parsed template as a dict, or an empty dict when the template
            is missing or cannot be read, so that callers can always use
            ``.get`` on the result and fall back to the class defaults.
        """
        logger.debug("No JinaSum plugin config.json, use plugins/jina_sum/config.json.template")
        try:
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    plugin_conf = json.load(f)
                    return plugin_conf
        except Exception as e:
            logger.exception(e)
        return {}

    def _get_jina_url(self, target_url):
        """Return the reader URL for ``target_url`` (``<base>/<target_url>``)."""
        # Tolerate a configured base that ends with "/".
        return self.jina_reader_base.rstrip("/") + "/" + target_url

    def _get_openai_chat_url(self):
        """Return the chat-completions URL derived from the configured API base."""
        return self.open_ai_api_base.rstrip("/") + "/chat/completions"

    def _get_openai_headers(self):
        """Return the HTTP headers (bearer token and Host) for the chat request."""
        return {
            'Authorization': f"Bearer {self.open_ai_api_key}",
            'Host': urlparse(self.open_ai_api_base).netloc
        }

    def _get_openai_payload(self, target_url_content):
        """Build the chat-completions payload for the given page text.

        The text is truncated to ``max_words`` characters and appended to the
        prompt, wrapped in triple single quotes, as a single user message.
        """
        # Simple truncation by string length (characters, not tokens).
        target_url_content = target_url_content[:self.max_words]
        sum_prompt = f"{self.prompt}\n\n'''{target_url_content}'''"
        messages = [{"role": "user", "content": sum_prompt}]
        payload = {
            'model': self.open_ai_model,
            'messages': messages
        }
        return payload

    def _check_url(self, target_url: str):
        """Return True if ``target_url`` looks like a URL this plugin can summarize.

        Surrounding whitespace is ignored. Non-string input is rejected.
        """
        if not isinstance(target_url, str):
            return False
        candidate = target_url.strip()
        # WeChat validates official-account card links, so they cannot be
        # fetched directly; skip them.
        if candidate.startswith(("https://mp.weixin.qq.com/s?__biz=", "http://mp.weixin.qq.com/s?__biz=")):
            return False
        # Minimal check that the text is an http(s) URL.
        return candidate.startswith(("http://", "https://"))

0 comments on commit e1c0a62

Please sign in to comment.