Commit c8354490 authored by nanahira

save

parent b1186e44
/home/nanahira/zeeai/doi-fetch
\ No newline at end of file
---
# Compose template for two services: a noVNC container and the doi-fetch
# worker. The `{{ ... }}` placeholders are template variables that must be
# substituted before Compose reads this file.
version: '2.4'
services:
  novnc:
    restart: always
    image: theasp/novnc:latest
    environment:
      # Environment values are quoted so every parser reads them as strings
      # ('no' would otherwise become a boolean, the sizes integers).
      RUN_XTERM: 'no'
      DISPLAY_WIDTH: '1920'
      DISPLAY_HEIGHT: '1080'
    ports:
      # Templated host port mapped to container port 8080.
      - '{{vnc_port}}:8080'
  doi-fetch:
    restart: always
    build: './data/src'
    volumes:
      - ./data/src/data:/usr/src/app/data
      - ./output:/usr/src/app/output
    environment:
      # Points the worker's DISPLAY at display 0.0 on the `novnc` service.
      DISPLAY: 'novnc:0.0'
      OFFSET: '{{offset}}'
      MAX_PAGES: '{{max_page}}'
    command:
      - npm
      - start
# Image that installs `requests` and runs get_link.py from /app.
FROM python:3
# Python block-buffers stdout when it is not attached to a TTY; disable that
# so print() output shows up in the container logs as it happens.
ENV PYTHONUNBUFFERED=1
# --no-cache-dir is the documented spelling of the option (the shorter
# --no-cache only works through option-prefix matching).
RUN pip install --no-cache-dir requests
WORKDIR /app
COPY ./app/ ./
CMD ["python", "get_link.py"]
import requests
import os
import json
import time
import random
def fetch_and_save_data(page, page_size=10, timeout=30):
    """
    Fetch one page of the article listing and save it as a JSON file.

    Sends a POST request to the pubscholar.cn articles endpoint asking for
    ``page_size`` records of page ``page`` ordered by date, descending. On an
    HTTP 200 answer the decoded body is written to
    ``./result/page_{page}.json`` (the directory is created if needed).

    Args:
        page: Page number to request.
        page_size: Number of records requested per page. Defaults to 10, the
            value that used to be hard-coded in both the request body and the
            "more data" check.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        True when the response's ``total`` is greater than
        ``page * page_size`` (more pages remain); False when there is no more
        data, the status code is not 200, the request raised a network error,
        or the body was not valid JSON.
    """
    url = 'https://pubscholar.cn/hky/open/resources/api/v1/articles'
    # NOTE(review): the XSRF token/cookie and the user_id below are hard-coded
    # session values; presumably they expire or are account-specific - confirm.
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json;charset=UTF-8',
        'Cookie': 'XSRF-TOKEN=40ca2d66-34b7-4e1f-b85a-3bd420ba4d50',
        'Origin': 'https://pubscholar.cn',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Mobile Safari/537.36',
        'X-XSRF-TOKEN': '40ca2d66-34b7-4e1f-b85a-3bd420ba4d50',
        'sec-ch-ua': '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
        'sec-ch-ua-mobile': '?1',
        'sec-ch-ua-platform': '"Android"'
    }
    data = {
        "page": page,
        "size": page_size,
        "order_field": "date",
        "order_direction": "desc",
        "user_id": "8b97c08abdc148597255ae8506d55555"
    }

    # Network failures take the same "stop fetching" path as a bad status
    # code instead of killing the process with a traceback.
    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
    except requests.RequestException as exc:
        print(f"请求失败,错误:{exc}")
        return False

    if response.status_code != 200:
        print(f"请求失败,状态码:{response.status_code}")
        return False  # Request failed: do not continue.

    try:
        response_data = response.json()
    except ValueError as exc:
        # A 200 answer whose body is not JSON (e.g. an HTML error page).
        print(f"请求失败,响应不是有效的 JSON:{exc}")
        return False

    # `or 0` also covers an explicit null total in the response.
    total = response_data.get('total', 0) or 0

    os.makedirs('./result', exist_ok=True)
    with open(f'./result/page_{page}.json', 'w', encoding='utf-8') as json_file:
        json.dump(response_data, json_file, ensure_ascii=False, indent=4)
    print(f"数据已保存到 page_{page}.json")

    # More data remains while the reported total exceeds what has been paged
    # through so far.
    return total > page * page_size
def _load_saved_page(path):
    """Return the page number stored in ``path``, or None if the file is missing or not an integer."""
    try:
        with open(path, 'r') as file:
            return int(file.read().strip())
    except FileNotFoundError:
        return None
    except ValueError:
        # An empty or truncated checkpoint (e.g. the process was killed while
        # writing) must not crash every subsequent start.
        print(f"进度文件内容无效,已忽略:{path}")
        return None


def _store_page(path, page):
    """Persist ``page`` to ``path`` by writing a temp file and renaming it over the target."""
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as file:
        file.write(str(page))
    # os.replace swaps the file in one step, so a reader never sees a
    # half-written checkpoint.
    os.replace(tmp_path, path)


def main():
    """
    Fetch listing pages one after another, checkpointing progress on disk.

    The first page to request is read from ``./result/0.current_page.txt``.
    When that file is missing or unusable, the start page is the
    ``START_PAGE`` environment variable (0 when unset or empty) plus one.
    After every page for which ``fetch_and_save_data`` reports more data, the
    next page number is written to the checkpoint file and the loop pauses
    for a short random delay before the next request.

    Raises:
        ValueError: if ``START_PAGE`` is set to a non-empty, non-integer value.
    """
    page_file_path = './result/0.current_page.txt'

    # Resume from the checkpoint when there is one; otherwise start after
    # START_PAGE. `or 0` makes an empty START_PAGE behave like an unset one.
    page = _load_saved_page(page_file_path)
    if page is None:
        page = int(os.getenv('START_PAGE') or 0) + 1

    # Keep requesting while the fetcher reports that more data is available.
    while fetch_and_save_data(page):
        page += 1
        _store_page(page_file_path, page)  # Record the next page to fetch.

        # Random pause of 0 to 0.5 seconds between consecutive requests.
        delay = random.uniform(0, 0.5)
        print(f"等待 {delay:.2f} 秒后继续下一个请求...")
        time.sleep(delay)
# Script entry point: run the fetch loop only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
---
# Compose template for the mypaper-listing service. `{{offset}}` is a
# template variable substituted before Compose reads this file.
version: '2.4'
services:
  mypaper-listing:
    build: ./data/build
    restart: always
    environment:
      # Passed to the container as the START_PAGE environment variable.
      START_PAGE: '{{offset}}'
    volumes:
      # Host ./result is mounted at /app/result inside the container.
      - ./result:/app/result
# ocserv configuration template. Values written as {{ ... }} and the
# {% for %} block near the end are template expressions that are expanded
# before the server reads this file.

# Authentication: plain password auth backed by /etc/ocserv/ocpasswd.
auth = "plain[passwd=/etc/ocserv/ocpasswd]"
listen-host-is-dyndns = true
# TCP and UDP both use the same templated port.
tcp-port = {{port}}
udp-port = {{port}}
# User and group the server runs as.
run-as-user = nobody
run-as-group = daemon
socket-file = /run/ocserv.socket
# TLS material: server certificate chain, private key, DH parameters and CA
# certificate.
# NOTE(review): ca-cert points at the "snakeoil" certificate path - confirm
# this is intended.
server-cert = /etc/ssl/railgun/certs/fullchain.pem
server-key = /etc/ssl/railgun/certs/privkey.pem
dh-params = /etc/ssl/railgun/dhparam.pem
ca-cert = /etc/ssl/certs/ssl-cert-snakeoil.pem
isolate-workers = true
# Statistics, keepalive and dead-peer-detection intervals.
# NOTE(review): units are presumably seconds - confirm against the ocserv
# manual.
stats-report-time = 360
server-stats-reset-time = 604800
keepalive = 32400
dpd = 90
mobile-dpd = 1800
switch-to-tcp-timeout = 25
try-mtu-discovery = true
cert-user-oid = 0.9.2342.19200300.100.1.1
compression = true
no-compress-limit = 256
tls-priorities = "NORMAL:%SERVER_PRECEDENCE:%COMPAT:-RSA:-VERS-SSL3.0:-ARCFOUR-128"
# Authentication, idle, ban and cookie timing settings.
auth-timeout = 240
idle-timeout = 1200
mobile-idle-timeout = 1800
min-reauth-time = 300
max-ban-score = 80
ban-reset-time = 300
cookie-timeout = 86400
persistent-cookies = true
deny-roaming = false
rekey-time = 172800
rekey-method = ssl
use-occtl = true
pid-file = /run/ocserv.pid
# Network device name, address pool and DNS server (pool and DNS are
# templated).
device = ocs1
predictable-ips = true
ipv4-network = {{network}}
dns = {{dns}}
ping-leases = false
# One `route` line is emitted per entry of the `chnrouter` list; the list is
# obtained by rendering ./result.yaml as a template and parsing it as YAML.
{% for net in (lookup('template', './result.yaml') | from_yaml).chnrouter %}
route = {{net}}
{% endfor %}
cisco-client-compat = true
dtls-legacy = true
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment