爬虫案例（用 Java 编写网络爬虫的简单案例分享）

2025-08-09 00:57:01 阅读 1002 评论 0

摘要：前言爬虫是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。现在比较适合写网络爬虫的应该是Python，今天给大家介绍一下java编写网络爬虫的简单案例。代码展示话不多说，直接上代码public class SpiderService {public final static String REQUEST_URL = "http://

前言

爬虫是一种按照一定的规则，自动抓取万维网信息的程序或者脚本。目前最常用来编写网络爬虫的语言是 Python，今天给大家介绍一个用 Java 编写网络爬虫的简单案例。

代码展示

话不多说，直接上代码

/**
 * Minimal keyword-driven web crawler.
 *
 * <p>Starting from a URL, every line of the page that matches {@link #KEYWORD} is inspected:
 * if the line carries an {@code href} to an absolute http(s) URL, that link is crawled in turn;
 * otherwise the keyword-bearing text of the line is printed to standard output.
 *
 * <p>State is kept in static fields, so the class is not safe for concurrent use.
 *
 * <p>JDK types that the original snippet did not already use are written fully qualified so that
 * no additional import lines are required.
 */
public class SpiderService {

    /** Page the crawl starts from when the class is run through {@link #main}. */
    public final static String REQUEST_URL = "http://news.sohu.com/";

    /** Regular expression selecting whole lines that mention the keyword. */
    public final static String KEYWORD = "^.*疫情.*$";

    /** Number of keyword-matching lines processed so far; reading stops once the limit is hit. */
    public static Integer count = 0;

    /** Upper bound on {@link #count} after which no further lines are read. */
    private static final int MAX_MATCHES = 1000;

    /** Connect/read timeout so a stalled server cannot hang the crawl forever. */
    private static final int TIMEOUT_MILLIS = 10_000;

    // Patterns are immutable and thread-safe; compile them once instead of once per line.
    private static final Pattern KEYWORD_PATTERN = Pattern.compile(KEYWORD);

    // "https?" (rather than "http|https://...") so that plain http links are captured too.
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\"(https?://.*?)[\"\\s]");

    private static final Pattern VALUE_PATTERN =
            Pattern.compile("[\\w,，.。\\f\\t\\v\\u4e00-\\u9fa5]*疫情[\\w,，.。\\f\\t\\v\\u4e00-\\u9fa5]*");

    /**
     * Crawls {@code url} and, recursively, the links found on keyword-matching lines.
     *
     * <p>Each call tracks the URLs it has already fetched, so a page is downloaded at most once
     * per call. {@link #count} is shared across calls and is not reset here.
     *
     * @param url absolute URL of the page to start from
     * @throws Exception if {@code url} itself is malformed or cannot be read; failures of links
     *     discovered while crawling are reported on standard error and skipped
     */
    public static void spider(String url) throws Exception {
        crawl(url, new java.util.HashSet<String>());
    }

    /** Fetches one page, skipping it when it was already fetched during this crawl. */
    private static void crawl(String url, java.util.Set<String> visited) throws Exception {
        if (!visited.add(url)) {
            return;
        }
        URLConnection connection = new URL(url).openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        // try-with-resources closes the reader (and the underlying stream) on every exit path.
        // The charset is pinned to UTF-8 so decoding does not depend on the platform default.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                connection.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            while (count < MAX_MATCHES && (line = reader.readLine()) != null) {
                Matcher matcher = KEYWORD_PATTERN.matcher(line);
                while (matcher.find()) {
                    count++;
                    String matched = matcher.group();
                    String link = getUrl(matched);
                    if (link != null && !link.isEmpty()) {
                        followLink(link, visited);
                    } else {
                        System.out.println("网址url: " + url + ",内容: " + getValue(matched));
                    }
                }
            }
        }
    }

    /** Crawls a discovered link; an unreachable link is logged and does not abort the crawl. */
    private static void followLink(String link, java.util.Set<String> visited) throws Exception {
        try {
            crawl(link, visited);
        } catch (java.io.IOException e) {
            System.err.println("Skipping unreachable link " + link + ": " + e);
        }
    }

    /**
     * Extracts the first absolute http(s) link from an {@code href} attribute.
     *
     * @param matchContent a line of HTML
     * @return the link, or {@code null} when the line contains none
     */
    private static String getUrl(String matchContent) {
        Matcher matcher = HREF_PATTERN.matcher(matchContent);
        // A line may contain several links; a collection could hold them all,
        // but this demo keeps only the first one.
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Extracts the first run of text around the keyword (word characters, CJK characters and
     * basic punctuation).
     *
     * @param matchContent a line of HTML
     * @return the text run, or {@code null} when the keyword does not occur
     */
    private static String getValue(String matchContent) {
        Matcher matcher = VALUE_PATTERN.matcher(matchContent);
        // A line may contain several such runs; a collection could hold them all,
        // but this demo keeps only the first one.
        return matcher.find() ? matcher.group() : null;
    }

    /** Runs the crawler against {@link #REQUEST_URL}. */
    public static void main(String[] args) throws Exception {
        spider(REQUEST_URL);
    }
}