Webmagic备份站点静态镜像

有的网站内容很喜欢,但想要一个本地能无障碍访问的镜像怎么办?当然是wget

之前使用的如下,但是文件已存在的时候会出现各种问题,导致每次基本都要重下,对于一个有着巨量图片的站点实在是太难了

wget -r -c -p -np -nc -k -e robots=off -U Mozilla -P /data/site -N http://$1

Webmagic默认不是干这个的,它是把页面内容抽取成结构化数据存储而不是直接把页面保存完事;不过看下文档也就是重写下PageProcessor和Pipeline的事;依然图样,保存图片还是需要开了线程池去下载

当然这样虽然减少了图片的重复下载,但并没有简单多少,以下仅作为一个记录,因为这个方案很快就放弃了

PageProcessor

关键是图片的路径部分,具体看注释

/**
 * webmagic PageProcessor that mirrors one site. Page/css/js links on the site
 * are queued as crawl targets; image links are only recorded in
 * {@link #FileLinks} so a separate thread pool can download them (and skip
 * ones already on disk), and absolute URLs inside HTML pages are rewritten to
 * relative paths so the mirror can be browsed offline.
 *
 * Created by dor on 16-11-26.
 */
public class HtmlPageProcessor implements PageProcessor {
// Image URLs collected during the crawl; downloaded later by a thread pool.
public final ConcurrentSet<String> FileLinks = new ConcurrentSet<>();

// Site host (no scheme), e.g. "xxx.com".
private final String name;
// Retry / politeness / timeout settings -- see the webmagic Site docs.
private Site site = Site.me().setCharset("UTF-8")
.setRetryTimes(8).setSleepTime(4000).setTimeOut(20000);

public HtmlPageProcessor(String name) {
this.name = name;
}

@Override
public void process(Page page) {
// Queue all links plus css/js; both host-absolute ("/...") paths and
// fully-qualified URLs starting with this site's name must be matched.
page.addTargetRequests(page.getHtml().links().regex("((http|https)://" + name + "/[^?#]+)").all());
page.addTargetRequests(page.getHtml().$("link[href]", "href").regex("(^/[^?#]+)").all());
page.addTargetRequests(page.getHtml().$("link[href]", "href").regex("((http|https)://" + name + "/[^?#]+)").all());
page.addTargetRequests(page.getHtml().$("script[src]", "src").regex("(^/[^?#]+)").all());
page.addTargetRequests(page.getHtml().$("script[src]", "src").regex("((http|https)://" + name + "/[^?#]+)").all());

// Images must NOT become crawl targets, or they get re-downloaded on every
// run (and would need queueing twice); collect them into a set instead and
// download them after the crawl finishes.
//page.addTargetRequests(page.getHtml().$("img[src]", "src").regex("(^/[^?#]+)").all());
//page.addTargetRequests(page.getHtml().$("img[src]", "src").regex("((http|https)://" + name + "/[^?#]+)").all());
FileLinks.addAll(page.getHtml().$("img[src]", "src").regex("([\\w\\d_\\-.?&:/]+)").all());

String url = page.getRequest().getUrl();
StringBuilder replace = new StringBuilder(".");
String[] split = HtmlFileUtil.processUrl(url).split("/");
// Build the relative prefix back to the site root: one "/.." per directory level.
if (split.length > 1) {
for (int i = 1; i < split.length - 1; i += 1) {
replace.append("/..");
}
}
if (url.endsWith("/")) replace.append("/..");

// Rewriting only makes sense inside HTML. Beware: getHtml() wraps the content
// in an <html> tag even for css/js, and getRawText() is not a byte stream, so
// images fetched through it are corrupted -- they need the separate download path.
if (url.endsWith(name) || url.endsWith("/") || HtmlFileUtil.isHtml(url)) {
page.putField("all", page.getHtml()
// directory links get an explicit index.html appended
.replace("/\"", "/index.html\"")
.replace("\"/", "\"" + replace.toString() + "/")
.replace("\\?", "%3F")
.replace("(http|https)://" + name, replace.toString())
// off-site resources are mirrored under <root>/file/
.replace("src=\"(http|https)://[^/]+/", "src=\"" + replace.toString() + "/file/"));
} else {
page.putField("all", page.getRawText());
}
}

@Override
public Site getSite() {
return site;
}
}

Pipeline

我会说这个是在FilePipeline上改的嘛,默认还要保存成key/value形式呢

/**
 * Pipeline that persists each crawled page under {@code path}, mirroring the
 * URL structure on disk. URLs that do not name a file get {@code index.html}
 * appended; server-side scripts (.php, .jsp, ...) are skipped because they
 * have no static counterpart.
 *
 * Adapted from webmagic's FilePipeline, which would otherwise store results
 * as key/value pairs. Created by dor on 16-11-26.
 */
@ThreadSafe
public class HtmlFilePipeline extends FilePersistentBase implements Pipeline {
    // Trailing file extension such as ".html" or ".png"; compiled once
    // (the original recompiled this Pattern on every process() call).
    private static final Pattern EXTENSION = Pattern.compile("\\.[\\w\\d]+$");

    private final Logger logger = LoggerFactory.getLogger(getClass());

    public HtmlFilePipeline(String path) {
        setPath(path);
    }

    /**
     * Writes every result value of this page to the file derived from its URL.
     * IO failures are logged and swallowed so one bad page does not stop the crawl.
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String url = resultItems.getRequest().getUrl();
        String relative = HtmlFileUtil.processUrl(url);
        if (HtmlFileUtil.isServer(relative)) {
            return; // dynamic page -- nothing static to save
        }
        String suffix = "index.html";
        String path = this.path + PATH_SEPERATOR + relative + PATH_SEPERATOR;
        if (relative.contains("/") && EXTENSION.matcher(relative).find()) {
            // The URL already names a file; save it as-is, no index.html.
            suffix = "";
            path = this.path + PATH_SEPERATOR + relative;
        }

        try {
            path = URLDecoder.decode(path, "UTF-8");
            File file = getFile(path + suffix);
            // try-with-resources: the original leaked the writer when println threw.
            try (PrintWriter printWriter = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream(file), "UTF-8"))) {
                for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                    printWriter.println(entry.getValue());
                }
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}

FileUtil

这样写不好,然而方便啊。。然后全重写了

/**
* Created by dor on 16-11-28.
*/
public class HtmlFileUtil {
private static final String separator = File.separator;
private static final Logger logger = LoggerFactory.getLogger(HtmlFileUtil.class);

public static boolean isHtml(String path) {
return path.endsWith("/") || path.endsWith(".htm") || path.endsWith(".html");
}

public static boolean isServer(String path) {
return path.endsWith(".php") || path.endsWith(".asp") || path.endsWith(".jsp") ||
path.endsWith(".action") || path.endsWith(".do");
}
//防止出现各种不适
public static String processUrl(String url) {
return url.replaceFirst("^(http|https)://", "")
.replaceAll("//", "/");
}

//保存文件
public static void saveFile(String basePath, String siteName, String url) {
String processUrl = processUrl(url);
String filePath = basePath + "/" + processUrl;
if (!processUrl.startsWith(siteName)) {
filePath = basePath + "/" + siteName + "/file" + processUrl.replaceFirst("[^/]+/", "/");
}

//替换下符合操作系统的分割符
File file = new File(filePath.replaceAll("/", separator));
logger.info(Thread.currentThread() + ": " + url + " -> " + file.getAbsolutePath());

if (!file.exists()) {
File dirs = new File(filePath.replaceFirst("/[^/]+$", "/").replaceAll("/", separator));
try {
if (!dirs.exists()) {
if (!dirs.mkdirs()) throw new RuntimeException("make dir error");
}
HttpClients.createDefault().execute(new HttpGet(url))
.getEntity().writeTo(new FileOutputStream(file));

} catch (ConnectTimeoutException e) {
logger.warn("connect timeout", e);
try {
Thread.sleep(1000);
} catch (InterruptedException ie) {
logger.warn("interrupted error", ie);
}
saveFile(basePath, siteName, url);
} catch (IOException e) {
logger.warn("write file error", e);
}
}
}

//几个启动类及重载
//siteName网站名,请以http开头;siteName保存路径;thread进程数
public static void startGet(String siteName, String basePath, int thread) {
startGet(siteName, basePath, thread, null, 0);
}

public static void startGet(String siteName, String basePath,
int thread, String proxyAddr, int proxyPort) {
String[] split = siteName.split("//");
if (split.length < 2) throw new RuntimeException("site name error");
startGet(split[0] + "//", split[1], basePath, thread, proxyAddr, proxyPort);
}

public static void startGet(String scheme, String siteName, String basePath,
int thread, String proxyAddr, int proxyPort) {
HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
if (null != proxyAddr)
httpClientDownloader.setProxyProvider(SimpleProxyProvider.from(new Proxy(proxyAddr, proxyPort)));

HtmlPageProcessor processor = new HtmlPageProcessor(siteName);
Spider.create(processor).addUrl(scheme + siteName)
.addPipeline(new HtmlFilePipeline(basePath))
.setDownloader(httpClientDownloader)
.thread(thread).run();

ExecutorService service = Executors.newFixedThreadPool(thread);
for (String link : processor.FileLinks) {
Runnable runnable = () -> {
if (link.startsWith("/") || link.startsWith("./")) {
saveFile(basePath, siteName, scheme + siteName + link);
} else {
saveFile(basePath, siteName, link);
}
};
service.execute(runnable);
}

service.shutdown();
}
}

MainClass

启动就没什么可说了直接参照官方代码

图片相关,只能重写个启动方法了

// Mirror http://xxx.com into /data/site with 8 threads.
HtmlFileUtil.startGet("http://xxx.com", "/data/site", 8);
// Explicit-scheme overload, routed through an HTTP proxy at 127.0.0.1:7080.
HtmlFileUtil.startGet("https://", "xxx.com", "/data/site", 8, "127.0.0.1", 7080);

总体上来说还是很简单易用的,每个组件看到就明白是做什么的了,效果也不错,没错慢的是网络!


Webmagic备份站点静态镜像
https://back.pub/post/spirder-webmagic-download/
作者
Dash
发布于
2016年11月28日
许可协议