fix: 同步知识库,无法获取内容

This commit is contained in:
shaohuzhang1 2024-02-29 15:14:53 +08:00
parent 8450b3598c
commit 22c319a2bf
2 changed files with 6 additions and 2 deletions

View File

@@ -7,7 +7,7 @@ from typing import List, Set
import requests import requests
import html2text as ht import html2text as ht
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, ParseResult from urllib.parse import urljoin, urlparse, ParseResult, urlsplit, parse_qs
requests.packages.urllib3.disable_warnings() requests.packages.urllib3.disable_warnings()
@@ -60,7 +60,11 @@ class Fork:
def __init__(self, base_fork_url: str, selector_list: List[str]): def __init__(self, base_fork_url: str, selector_list: List[str]):
self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.') self.base_fork_url = urljoin(base_fork_url if base_fork_url.endswith("/") else base_fork_url + '/', '.')
parsed = urlsplit(base_fork_url)
query = parsed.query
self.base_fork_url = self.base_fork_url[:-1] self.base_fork_url = self.base_fork_url[:-1]
if query is not None and len(query) > 0:
self.base_fork_url = self.base_fork_url + '?' + query
self.selector_list = [selector for selector in selector_list if selector is not None and len(selector) > 0] self.selector_list = [selector for selector in selector_list if selector is not None and len(selector) > 0]
self.urlparse = urlparse(self.base_fork_url) self.urlparse = urlparse(self.base_fork_url)
self.base_url = ParseResult(scheme=self.urlparse.scheme, netloc=self.urlparse.netloc, path='', params='', self.base_url = ParseResult(scheme=self.urlparse.scheme, netloc=self.urlparse.netloc, path='', params='',

View File

@@ -26,7 +26,7 @@ tiktoken = "^0.5.1"
qianfan = "^0.1.1" qianfan = "^0.1.1"
pycryptodome = "^3.19.0" pycryptodome = "^3.19.0"
beautifulsoup4 = "^4.12.2" beautifulsoup4 = "^4.12.2"
html2text = "^2020.1.16" html2text = "^2024.2.26"
[build-system] [build-system]