背景
某天,网站响应速度超级慢,页面加载数据要等好几秒,业务员一直向我们技术反馈。经排查,发现有程序在频繁地访问我们的接口,也就是爬虫在抓取我们的数据。我们需要找到频繁访问的 IP,并让服务器限制该 IP 的访问,因此要先在日志文件中定位这个 IP。下边是我写的简单代码,供大家参考。
代码
/**
 * Analyses a Tomcat access log: counts hits per IP and per URL, records which
 * URLs each IP requested, and writes the three statistics to an .xlsx report
 * on the user's home directory. Optionally splits oversized logs into chunks
 * first ({@link #ISSTARTZONE}).
 */
public class AnalyseLog {
    /** Files larger than this (10 MB) are split into chunks before analysis. */
    public static final long OVERSIZE = 10485760;
    /** Extension given to the chunk files. */
    public static final String SUFFIX = ".txt";
    /** Name of the temporary directory that holds the chunk files. */
    public static final String TEMPFILENAME = "zoneTemp";
    /** Whether chunking is enabled; off by default. */
    public static final boolean ISSTARTZONE = false;

    // Regexes are constant, so build the strings once instead of per log line.
    private static final String IP_REGEX =
            "((?:(?:25[0-5]|2[0-4]\\d|[01]?\\d?\\d)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d?\\d))";
    private static final String URL_REGEX = "\\s(/[\\w-./?%&=]*)?";

    /**
     * Entry point: reads the log (or its chunks), aggregates IP/URL counts and
     * the IP-to-URL relation, writes the Excel report, then removes any chunk
     * directory it created. Prints the elapsed time in seconds.
     */
    @Test
    public void analyseLogUtils() {
        DateTime start = DateTime.now();
        String filePath = "D:\\estzone\\localhost_access_log.2020-10-16.txt";
        File file = new File(filePath);
        if (!file.exists() || file.isDirectory()) {
            System.out.println("文件不存在或不是文件");
            return;
        }
        long zoneNum = 0;                       // number of chunks (0 = no chunking happened)
        List<File> zoneList = new ArrayList<>();// files to analyse (original or chunks)
        long length = file.length();
        if (ISSTARTZONE) {
            if (length > OVERSIZE) {
                // Ceiling division: how many OVERSIZE-byte chunks are needed.
                zoneNum = length % OVERSIZE == 0 ? length / OVERSIZE : length / OVERSIZE + 1;
            }
            if (zoneNum == 0) {
                zoneList.add(file);
            } else {
                zoneList.addAll(AnalyseLog.zoneFile(file));
            }
        } else {
            zoneList.add(file);
        }

        Map<String, Long> ipMap = new HashMap<>();               // hits per IP
        Map<String, Long> urlMap = new HashMap<>();              // hits per URL
        Map<String, List<String>> ipUrlConRelate = new HashMap<>(); // IP -> distinct URLs it requested

        for (File temp : zoneList) {
            // try-with-resources closes reader/streams even on exception; the
            // original's finally block could NPE when the open itself failed.
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(new FileInputStream(temp)))) {
                String line;
                // BUG FIX: the original called readLine() once in the loop
                // condition and again in the body, silently skipping every
                // other log line and halving all counts.
                while ((line = br.readLine()) != null) {
                    if (!StringUtils.isNotBlank(line)) {
                        continue;
                    }
                    // First matched IP/URL on the line (empty string if none),
                    // while folding all per-line counts into the totals.
                    String ip = accumulate(AnalyseLog.regexTextNum(line, IP_REGEX), ipMap);
                    String url = accumulate(AnalyseLog.regexTextNum(line, URL_REGEX), urlMap);
                    // Relate the line's IP to the URL it requested (deduplicated).
                    if (StringUtils.isNotBlank(ip) && StringUtils.isNotBlank(url)) {
                        List<String> urls = ipUrlConRelate.computeIfAbsent(ip, k -> new ArrayList<>());
                        if (!urls.contains(url)) {
                            urls.add(url);
                        }
                    }
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        // Build the POI report: one sheet per statistic.
        XSSFWorkbook xssfWorkbook = new XSSFWorkbook();
        writeCountSheet(xssfWorkbook, "ip统计", ipMap);
        writeCountSheet(xssfWorkbook, "url统计", urlMap);

        // ip-url sheet: first row of each group carries the IP in column 0;
        // additional URLs for the same IP occupy column 1 of following rows.
        XSSFSheet ipurlSheet = xssfWorkbook.createSheet("ip-url统计");
        ipurlSheet.setDefaultColumnWidth(15);
        int k = 0;
        for (Map.Entry<String, List<String>> entry : ipUrlConRelate.entrySet()) {
            List<String> urls = entry.getValue();
            for (int j = 0; j < urls.size(); j++) {
                XSSFRow row = ipurlSheet.createRow(k);
                if (j == 0) {
                    row.createCell(0).setCellValue(entry.getKey());
                }
                row.createCell(1).setCellValue(urls.get(j));
                k++;
            }
        }

        String path = FileSystemView.getFileSystemView().getHomeDirectory().getPath();
        File resultFile = new File(path, "analyse.xlsx");
        // BUG FIX: the original never closed this FileOutputStream.
        try (FileOutputStream fos = new FileOutputStream(resultFile)) {
            xssfWorkbook.write(fos);
        } catch (IOException e) {
            e.printStackTrace();
        }

        if (zoneNum != 0) {
            // BUG FIX: the original shelled out via Runtime.exec with a
            // concatenated command string (injection-prone, Windows-only);
            // delete the temp chunk directory with the File API instead.
            deleteRecursively(new File(file.getParentFile().getPath() + "/" + TEMPFILENAME));
        }
        DateTime end = DateTime.now();
        System.out.println("耗时:" + (end.getMillis() - start.getMillis()) / 1000 + "秒");
    }

    /**
     * Folds one line's match counts into the running totals.
     *
     * @param lineCounts match -> occurrences on the current line
     * @param totals     running totals, updated in place
     * @return the first key of {@code lineCounts} (map iteration order, as the
     *         original code did via {@code keySet().toArray()[0]}), or "" if empty
     */
    private static String accumulate(Map<String, Long> lineCounts, Map<String, Long> totals) {
        String first = "";
        for (Map.Entry<String, Long> entry : lineCounts.entrySet()) {
            if (first.isEmpty()) {
                first = entry.getKey();
            }
            totals.merge(entry.getKey(), entry.getValue(), Long::sum);
        }
        return first;
    }

    /**
     * Writes a two-column (key, count) sheet to the workbook.
     *
     * @param workbook target workbook
     * @param name     sheet name
     * @param counts   values to write, one row per entry
     */
    private static void writeCountSheet(XSSFWorkbook workbook, String name, Map<String, Long> counts) {
        XSSFSheet sheet = workbook.createSheet(name);
        sheet.setDefaultColumnWidth(15);
        int rowIndex = 0;
        for (Map.Entry<String, Long> entry : counts.entrySet()) {
            XSSFRow row = sheet.createRow(rowIndex++);
            row.createCell(0).setCellValue(entry.getKey());
            row.createCell(1).setCellValue(entry.getValue());
        }
    }

    /**
     * Recursively deletes a file or directory tree. Missing paths are ignored;
     * failures are reported but do not abort the traversal.
     */
    private static void deleteRecursively(File target) {
        if (target == null || !target.exists()) {
            return;
        }
        File[] children = target.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        if (!target.delete()) {
            System.err.println("Failed to delete " + target.getPath());
        }
    }

    /**
     * Counts occurrences of every non-blank regex match in the given text.
     *
     * @param context text to scan (one log line)
     * @param regex   pattern to match
     * @return match -> number of occurrences in {@code context}
     */
    public static Map<String, Long> regexTextNum(String context, String regex) {
        Map<String, Long> result = new HashMap<>();
        Matcher matcher = Pattern.compile(regex).matcher(context);
        while (matcher.find()) {
            String group = matcher.group();
            if (StringUtils.isNotBlank(group)) {
                result.merge(group, 1L, Long::sum);
            }
        }
        return result;
    }

    /**
     * Splits a file into chunks of roughly {@link #OVERSIZE} characters,
     * written under a {@link #TEMPFILENAME} directory next to the source file.
     * Chunk boundaries fall on line breaks, so no log line is cut in half.
     *
     * @param file source file to split
     * @return the chunk files, in order
     */
    public static List<File> zoneFile(File file) {
        List<File> zoneList = new ArrayList<>();
        StringBuilder buffer = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(file)))) {
            String line;
            while ((line = br.readLine()) != null) {
                buffer.append(line).append("\n");
                if (buffer.length() > OVERSIZE) {
                    // Buffer full: flush it to a new chunk file.
                    zoneList.add(writeChunk(file, buffer));
                }
            }
            if (buffer.length() != 0) {
                // Flush whatever remains after the last full chunk.
                zoneList.add(writeChunk(file, buffer));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return zoneList;
    }

    /**
     * Writes the buffer to a fresh temp file under the chunk directory and
     * clears the buffer. The writer is closed by try-with-resources; the
     * original code reused a field-like writer and NPE'd in its finally block
     * when no chunk had been written.
     *
     * @param source the file being split (determines the chunk directory)
     * @param buffer accumulated lines; emptied on return
     * @return the chunk file that was written
     * @throws IOException if the chunk cannot be created or written
     */
    private static File writeChunk(File source, StringBuilder buffer) throws IOException {
        File tempDir = new File(source.getParentFile().getPath() + "/" + TEMPFILENAME);
        tempDir.mkdirs();
        File zoneFile = File.createTempFile("tomcat_log_zone_chapter", SUFFIX, tempDir);
        try (BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(zoneFile)))) {
            bw.write(buffer.toString());
        }
        buffer.setLength(0);
        return zoneFile;
    }
}
用 Java 像这样逐行分析大文件,开发效率和代码简洁度可能不如 Python;如果有大量的数据分析需求,建议使用 Python 及其数据分析库。
你的赞和关注是对我最大的肯定,希望大家多多支持,谢谢大家。