feat: basic jina reader

hanfangyuan4396 · Apr 19, 2024 · e1c0a62 · e1c0a62
1 parent c780c1e
commit e1c0a62
Show file tree

Hide file tree

Showing 5 changed files with 159 additions and 160 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,160 +1,2 @@
-# Byte-compiled / optimized / DLL files
 __pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
+config.json
diff --git a/README.md b/README.md
@@ -1 +1,16 @@
-# jina_sumary
+# jina_sumary
+ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT总结网页链接内容
+
+微信公众号近期更新了链接校验机制：公众号卡片链接会被校验，暂时没有找到从公众号卡片链接获取直接链接的合适方法。不过，此插件仍然可以总结公众号直接链接，以及其他卡片链接（如小红书、哔哩哔哩等）。
+
+config.json 配置说明
+```json
+{
+  "jina_reader_base": "https://r.jina.ai",           // jina reader链接，默认为https://r.jina.ai
+  "open_ai_api_base": "https://api.openai.com/v1",   // chatgpt chat url
+  "open_ai_api_key":  "sk-xxx",                      // chatgpt api key
+  "open_ai_model": "gpt-3.5-turbo",                  // chatgpt model
+  "max_words": 8000,                                 // 网页链接内容的最大字数，防止超过最大输入token，使用字符串长度简单计数
+  "prompt": "我需要对下面的文本进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。"                           // 链接内容总结提示词
+}
+```
diff --git a/__init__.py b/__init__.py
@@ -0,0 +1 @@
+from .jina_sum import *
diff --git a/config.json.template b/config.json.template
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "open_ai_api_base": "https://api.openai.com/v1",
+  "open_ai_api_key":  "sk-xxx",
+  "open_ai_model": "gpt-3.5-turbo",
+  "max_words": 8000,
+  "prompt": "我需要对下面的文本进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动。"
+}
diff --git a/jina_sum.py b/jina_sum.py
@@ -0,0 +1,133 @@
+# encoding:utf-8
+import json
+import os
+from urllib.parse import urlparse
+
+import requests
+
+import plugins
+from bridge.context import ContextType
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from plugins import *
+
+@plugins.register(
+    name="JinaSum",
+    desire_priority=110,
+    hidden=False,
+    desc="Sum url link content with jina reader and llm",
+    version="0.1",
+    author="hanfangyuan",
+)
+class JinaSum(Plugin):
+
+    jina_reader_base = "https://r.jina.ai"
+    open_ai_api_base = "https://api.openai.com/v1"
+    open_ai_model = "gpt-3.5-turbo"
+    max_words = 8000
+    prompt = "我需要对下面引号内文档进行总结，总结输出包括以下三个部分：\n📖 一句话总结\n🔑 关键要点,用数字序号列出3-5个文章的核心内容\n🏷 标签: #xx #xx\n请使用emoji让你的表达更生动\n\n"
+
+    def __init__(self):
+        super().__init__()
+        try:
+            self.config = super().load_config()
+            if not self.config:
+                self.config = self._load_config_template()
+            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
+            self.open_ai_api_base = self.config.get("open_ai_api_base", self.open_ai_api_base)
+            self.open_ai_api_key = self.config.get("open_ai_api_key", "")
+            self.open_ai_model = self.config.get("open_ai_model", self.open_ai_model)
+            self.max_words = self.config.get("max_words", self.max_words)
+            self.prompt = self.config.get("prompt", self.prompt)
+            logger.info(f"[JinaSum] inited, config={self.config}")
+            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
+        except Exception as e:
+            logger.error(f"[JinaSum] 初始化异常：{e}")
+            raise "[JinaSum] init failed, ignore "
+
+    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
+        try:
+            context = e_context["context"]
+            content = context.content
+            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
+                return
+
+            if not self._check_url(content):
+                logger.debug(f"[JinaSum] {content} not a url, skip")
+                return
+            if retry_count == 0:
+                logger.debug("[JinaSum] on_handle_context. content: %s" % content)
+                reply = Reply(ReplyType.TEXT, "🎉正在为您生成总结，请稍候...")
+                channel = e_context["channel"]
+                channel.send(reply, context)
+
+            target_url = content
+            jina_url = self._get_jina_url(target_url)
+            response = requests.get(jina_url, timeout=60)
+            response.raise_for_status()
+            target_url_content = response.text
+
+            openai_chat_url = self._get_openai_chat_url()
+            openai_headers = self._get_openai_headers()
+            openai_payload = self._get_openai_payload(target_url_content)
+            logger.debug(f"[JinaSum] openai_chat_url: {openai_chat_url}, openai_headers: {openai_headers}, openai_payload: {openai_payload}")
+            response = requests.post(openai_chat_url, headers=openai_headers, json=openai_payload, timeout=60)
+            response.raise_for_status()
+            result = response.json()['choices'][0]['message']['content']
+            reply = Reply(ReplyType.TEXT, result)
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+        except Exception as e:
+            if retry_count < 3:
+                logger.warning(f"[JinaSum] {str(e)}, retry {retry_count + 1}")
+                self.on_handle_context(e_context, retry_count + 1)
+                return
+
+            logger.exception(f"[JinaSum] {str(e)}")
+            reply = Reply(ReplyType.ERROR, "我暂时无法总结链接，请稍后再试")
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+    def get_help_text(self, verbose, **kwargs):
+        return f'使用jina reader和ChatGPT总结网页链接内容'
+
+    def _load_config_template(self):
+        logger.debug("No Suno plugin config.json, use plugins/jina_sum/config.json.template")
+        try:
+            plugin_config_path = os.path.join(self.path, "config.json.template")
+            if os.path.exists(plugin_config_path):
+                with open(plugin_config_path, "r", encoding="utf-8") as f:
+                    plugin_conf = json.load(f)
+                    return plugin_conf
+        except Exception as e:
+            logger.exception(e)
+
+    def _get_jina_url(self, target_url):
+        return self.jina_reader_base + "/" + target_url
+
+    def _get_openai_chat_url(self):
+        return self.open_ai_api_base + "/chat/completions"
+
+    def _get_openai_headers(self):
+        return {
+            'Authorization': f"Bearer {self.open_ai_api_key}",
+            'Host': urlparse(self.open_ai_api_base).netloc
+        }
+
+    def _get_openai_payload(self, target_url_content):
+        target_url_content = target_url_content[:self.max_words] # 通过字符串长度简单进行截断
+        sum_prompt = f"{self.prompt}\n\n'''{target_url_content}'''"
+        messages = [{"role": "user", "content": sum_prompt}]
+        payload = {
+            'model': self.open_ai_model,
+            'messages': messages
+        }
+        return payload
+
+    def _check_url(self, target_url: str):
+        # 微信官方做了校验，公众号卡片链接无法直接访问
+        if target_url.strip().startswith("https://mp.weixin.qq.com/s?__biz=") or target_url.strip().startswith("http://mp.weixin.qq.com/s?__biz="):
+            return False
+        # 简单校验是否是url
+        return target_url.strip().startswith("http://") or target_url.strip().startswith("https://")