# nhentai-favorites/scraper.py

from progress.spinner import PixelSpinner
from bs4 import BeautifulSoup
import yaml
import requests
import locale
import os
import json
import csv
if not os.path.isfile("set.yaml"):
    with open('set.yaml', 'w') as f:
        yaml.dump({"cookid": "", "useragent": ""}, f)
    print("Please edit set.yaml")
    exit()
with open('set.yaml', 'r') as f:
    data = yaml.safe_load(f)  # safe_load avoids depending on the optional LibYAML CLoader
cookie = data["cookid"]  # note: the config key really is spelled "cookid"
useragent = data["useragent"]
if cookie == "":
    print("Please edit set.yaml")
    exit()
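
# Expected set.yaml layout (the values below are placeholders, not real
# credentials):
#
#   cookid: "csrftoken=...; sessionid=..."  # full Cookie header from a logged-in browser
#   useragent: "Mozilla/5.0 (...)"          # must match the browser the cookie came from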
# settings
URL = "https://nhentai.net/favorites/"
APIURL = "https://nhentai.net/api/gallery/"
table = [
    ["id", "name", "tags"]
]
now = 1
allnumbers = []
allnames = []
alltags = []
locate = locale.getdefaultlocale()[0]  # deprecated since Python 3.11; locale.getlocale() is the modern spelling
if locate == "zh_TW":
    language = {
        "nodata": "沒有發現離線資料 抓取中請稍後...",
        "nodata2": "抓取完畢",
        "usedata": "使用離線資料",
        "getdata": "抓取資料中...",
        "403": "403 錯誤,可能被 cloudflare 阻擋,請檢查 cookie 是否正確",
        "nologin": "未登入,請先登入",
        "done": "完成"
    }
else:
    language = {
        "nodata": "No offline data found, fetching now, please wait...",
        "nodata2": "Done",
        "usedata": "Using offline data",
        "getdata": "Getting data...",
        "403": "403 error, possibly blocked by Cloudflare; please check that the cookie is correct",
        "nologin": "Not logged in, please log in first",
        "done": "Done"
    }
def banner():
    # ASCII-art banner: "nhentai-FAV"
    data = r"""
        _                _        _      ___ _____   __
  _ __ | |__   ___ _ __ | |_ __ _(_)    / __\/_\ /\ /\
 | '_ \| '_ \ / _ \ '_ \| __/ _` | |_____ / _\ //_\\ \ / /
 | | | | | | |  __/ | | | | |_| (_| |_____/ / / _ \ V /
 |_| |_|_| |_|\___|_| |_|\__\__,_|_|      \/ \_/ \_/\_/
"""
    print(data)
def wtfcloudflare(url, method="get", data=None):
    # Plain request helper: sends the saved cookie and user agent so
    # Cloudflare treats us like the browser that created the session.
    session = requests.Session()
    session.headers = {
        'Referer': "https://nhentai.net/login/",
        'User-Agent': useragent,
        'Cookie': cookie,
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
        'Accept-Encoding': 'gzip, deflate',
    }
    if method == "get":
        r = session.get(url)
    elif method == "post":
        r = session.post(url, data=data)
    else:
        raise ValueError(f"unsupported method: {method}")  # avoid returning an unbound r
    r.encoding = 'utf-8'
    return r
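
# Usage sketch (hypothetical gallery id, assuming the JSON API behind APIURL
# returns gallery metadata):
#   meta = wtfcloudflare(APIURL + "177013").json()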
def wtfcloudflare_t(url, method="get", data=None, useragent=None, cookie=None):
    # Streaming variant for binary downloads; takes explicit credentials
    # instead of relying on the module-level globals.
    session = requests.Session()
    session.headers = {
        'Referer': "https://nhentai.net/login/",
        'User-Agent': useragent,
        'Cookie': cookie,
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
        'Accept-Encoding': 'gzip, deflate',
    }
    if method == "get":
        r = session.get(url, stream=True)  # stream=True for large/binary files
    elif method == "post":
        r = session.post(url, data=data, stream=True)
    else:
        raise ValueError(f"unsupported method: {method}")
    r.raise_for_status()  # raise on HTTP errors
    return r
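
# Usage sketch (hypothetical id; get_torrents below does the same thing): the
# response is streamed, so consume it chunk by chunk rather than all at once:
#   r = wtfcloudflare_t("https://nhentai.net/g/177013/download",
#                       useragent=useragent, cookie=cookie)
#   with open("177013.torrent", "wb") as fh:
#       for chunk in r.iter_content(chunk_size=8192):
#           fh.write(chunk)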
def check_pass():
    res = wtfcloudflare("https://nhentai.net/")
    if res.status_code == 403:
        print(language["403"])
        exit()
url_list = []

def build_id_list():
    # Read the favorites dump and build one download URL per gallery id.
    with open('output.csv', 'r', encoding='utf-8-sig') as file:
        reader = csv.DictReader(file)
        print(reader.fieldnames)  # show the exact header names (debugging aid)
        for row in reader:
            if 'id' in row:
                formatted_url = f"https://nhentai.net/g/{row['id']}/download"
                url_list.append(formatted_url)
            else:
                print(f"Row without 'id': {row}")
banner()
check_pass()
build_id_list()
def get_torrents():
    with open('set.yaml', 'r') as f:
        data = yaml.safe_load(f)
    cookie = data["cookid"]
    useragent = data["useragent"]
    if cookie == "":
        print("Please edit set.yaml")
        exit()
    output_dir = "torrents"
    os.makedirs(output_dir, exist_ok=True)  # create the directory if it doesn't exist
    for url in url_list:
        # Extract the id from the URL for naming the file:
        # https://nhentai.net/g/<id>/download -> split('/') index 4 is <id>
        torrent_id = url.split('/')[4]
        torrent_path = os.path.join(output_dir, f"{torrent_id}.torrent")
        # Skip downloading if the torrent file already exists
        if os.path.exists(torrent_path):
            print(f"Torrent file already exists: {torrent_path}")
            continue
        response = wtfcloudflare_t(url, useragent=useragent, cookie=cookie)
        # Save the torrent file to disk
        with open(torrent_path, 'wb') as torrent_file:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    torrent_file.write(chunk)
        print(f"Downloaded torrent: {torrent_path}")
if __name__ == '__main__':
    get_torrents()
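
# Run directly: python scraper.py
# Expects set.yaml (credentials) and output.csv (favorites dump) next to this
# script; downloaded .torrent files are written to ./torrents/.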