通过wp-json接口抓取WordPress网站数据

WordPress网站一般会有开放的接口来获取网站元数据

参考：https://developer.wordpress.org/rest-api/reference/posts/

例如这个样例网站：https://www.sonymusic.com/

获取其博客文章列表的接口为：

https://www.sonymusic.com/wp-json/wp/v2/posts

读接口一般不需要鉴权，直接用浏览器打开就能访问。

我们抓取一个WordPress网站时，一般只需要抓取它的posts、categories、tags即可。

我这里简单写了个Python脚本来抓取所需的数据：

# -*- coding: utf-8 -*-

import cfscrape

import json

import os

from time import sleep

# Root URL of the target site's WordPress REST API (v2); pull() appends the
# collection name (e.g. "posts") and a "?page=N" query to it.
baseApiPath = "https://www.sonymusic.com/wp-json/wp/v2/"

def delAll(dir):
    """Delete every file directly inside ``dir``, keeping the directory itself.

    Args:
        dir: Path of the directory to empty.

    A missing directory is treated as already empty, so the function is safe
    to call before any output has been written. Sub-directories are left in
    place because ``os.remove`` cannot delete them.
    """
    if not os.path.isdir(dir):
        return
    for name in os.listdir(dir):
        target = os.path.join(dir, name)
        # Only unlink files (and symlinks); os.remove raises on a directory.
        if os.path.isfile(target) or os.path.islink(target):
            os.remove(target)

def pull(path, type, pageCount):
    """Download pages 1..pageCount of one REST collection into JSON files.

    Each item of every page is saved as ``<path><type>/<id>.json``. Files left
    in that directory by a previous run are removed first.

    Args:
        path: Output root, concatenated directly with ``type`` (so it should
            end with a path separator, e.g. ``"data/"``).
        type: Collection name appended to ``baseApiPath`` (e.g. ``"posts"``).
        pageCount: Number of pages to fetch; ``0`` fetches nothing.

    Raises:
        An HTTP error from ``raise_for_status()`` when a page request fails
        (assumes the cfscrape scraper returns requests-style responses).
    """
    target_dir = path + type
    # Create the collection directory itself (not just its parent) before it
    # is listed or written to; otherwise a first run fails.
    os.makedirs(target_dir, exist_ok=True)
    delAll(target_dir)
    scraper = cfscrape.create_scraper()
    for page in range(1, pageCount + 1):
        url = baseApiPath + type + "?page=" + str(page)
        print("try to get :" + url)
        response = scraper.get(url)
        # Fail on an HTTP error instead of iterating over an error payload.
        response.raise_for_status()
        for item in json.loads(response.content):
            item_path = target_dir + "/" + str(item['id']) + ".json"
            # "w" rather than "a": an id seen twice must not produce two
            # concatenated JSON documents in one file.
            with open(item_path, "w") as f:
                f.write(json.dumps(item))
            # NOTE(review): the pause runs once per saved item, as in the
            # original script, not once per HTTP request.
            sleep(0.1)

if __name__ == "__main__":
    # (collection, number of pages) pairs, fetched in this order.
    for collection, pages in (("tags", 0), ("categories", 2), ("posts", 61)):
        pull("data/", collection, pages)

# -*- coding: utf-8 -*-

import cfscrape
import json
import os
from time import sleep


# Root URL of the target site's WordPress REST API (v2).
baseApiPath = "https://www.sonymusic.com/wp-json/wp/v2/"


def delAll(dir):
    """Delete every file directly inside ``dir``, keeping the directory itself.

    A missing directory is treated as already empty; sub-directories are left
    in place because ``os.remove`` cannot delete them.
    """
    if not os.path.isdir(dir):
        return
    for name in os.listdir(dir):
        target = os.path.join(dir, name)
        if os.path.isfile(target) or os.path.islink(target):
            os.remove(target)


def pull(path, type, pageCount):
    """Download pages 1..pageCount of one REST collection into JSON files.

    Each item is saved as ``<path><type>/<id>.json``; files from a previous
    run are removed first. ``pageCount`` of ``0`` fetches nothing.
    """
    target_dir = path + type
    # Create the collection directory itself before it is listed or written.
    os.makedirs(target_dir, exist_ok=True)
    delAll(target_dir)
    scraper = cfscrape.create_scraper()
    for page in range(1, pageCount + 1):
        url = baseApiPath + type + "?page=" + str(page)
        print("try to get :" + url)
        response = scraper.get(url)
        # Fail on an HTTP error instead of iterating over an error payload.
        response.raise_for_status()
        for item in json.loads(response.content):
            item_path = target_dir + "/" + str(item['id']) + ".json"
            # "w" so an id seen twice cannot yield concatenated JSON.
            with open(item_path, "w") as f:
                f.write(json.dumps(item))
            # NOTE(review): pause is per saved item, as in the original.
            sleep(0.1)


if __name__ == "__main__":
    pull("data/", "tags", 0)
    pull("data/", "categories", 2)
    pull("data/", "posts", 61)

# -*- coding: utf-8 -*-

import cfscrape
import json
import os
from time import sleep


# Root URL of the target site's WordPress REST API (v2); pull() appends the
# collection name (e.g. "posts") and a "?page=N" query to it.
baseApiPath = "https://www.sonymusic.com/wp-json/wp/v2/"

def delAll(dir):
    """Delete every file directly inside ``dir``, keeping the directory itself.

    Args:
        dir: Path of the directory to empty.

    A missing directory is treated as already empty, so the function is safe
    to call before any output has been written. Sub-directories are left in
    place because ``os.remove`` cannot delete them.
    """
    if not os.path.isdir(dir):
        return
    for name in os.listdir(dir):
        target = os.path.join(dir, name)
        # Only unlink files (and symlinks); os.remove raises on a directory.
        if os.path.isfile(target) or os.path.islink(target):
            os.remove(target)

def pull(path, type, pageCount):
    """Download pages 1..pageCount of one REST collection into JSON files.

    Each item of every page is saved as ``<path><type>/<id>.json``. Files left
    in that directory by a previous run are removed first.

    Args:
        path: Output root, concatenated directly with ``type`` (so it should
            end with a path separator, e.g. ``"data/"``).
        type: Collection name appended to ``baseApiPath`` (e.g. ``"posts"``).
        pageCount: Number of pages to fetch; ``0`` fetches nothing.

    Raises:
        An HTTP error from ``raise_for_status()`` when a page request fails
        (assumes the cfscrape scraper returns requests-style responses).
    """
    target_dir = path + type
    # Create the collection directory itself (not just its parent) before it
    # is listed or written to; otherwise a first run fails.
    os.makedirs(target_dir, exist_ok=True)
    delAll(target_dir)
    scraper = cfscrape.create_scraper()
    for page in range(1, pageCount + 1):
        url = baseApiPath + type + "?page=" + str(page)
        print("try to get :" + url)
        response = scraper.get(url)
        # Fail on an HTTP error instead of iterating over an error payload.
        response.raise_for_status()
        for item in json.loads(response.content):
            item_path = target_dir + "/" + str(item['id']) + ".json"
            # "w" rather than "a": an id seen twice must not produce two
            # concatenated JSON documents in one file.
            with open(item_path, "w") as f:
                f.write(json.dumps(item))
            # NOTE(review): the pause runs once per saved item, as in the
            # original script, not once per HTTP request.
            sleep(0.1)

if __name__ == "__main__":
    # (collection, number of pages) pairs, fetched in this order.
    for collection, pages in (("tags", 0), ("categories", 2), ("posts", 61)):
        pull("data/", collection, pages)

你可能会注意到pull第三个参数是一个数值，这个数值是要抓取的数据的页数，这个页数从哪里获取呢？

你直接用浏览器打开对应的接口，查看响应头（response headers），其中有一个字段，key是x-wp-totalpages，它的值就是总页数。

WordPress会在响应头里返回该接口对应数据的数量：x-wp-total是记录总数，x-wp-totalpages是总页数。