Skip to content

Commit 002ae2e

Browse files
committed
Persist Resolution Errors and Blocklisted results
This should ensure that for each input line we get a result into the DB
1 parent ebcc448 commit 002ae2e

File tree

6 files changed

+144
-74
lines changed

6 files changed

+144
-74
lines changed

src/main/java/de/rub/nds/crawler/constant/JobStatus.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111
public enum JobStatus {
1212
/** Job is waiting to be executed. */
1313
TO_BE_EXECUTED(false),
14+
/** The domain was not resolvable. An empty result was written to DB. */
15+
UNRESOLVABLE(true),
16+
/** The domain was denylisted. An empty result was written to DB. */
17+
DENYLISTED(true),
1418
/** Job was successfully executed. Result was written to db. */
1519
SUCCESS(false),
1620
/** Job was successfully executed. No result was returned. An empty result was written to DB. */

src/main/java/de/rub/nds/crawler/core/ProgressMonitor.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ private String formatTime(double millis) {
9696
@Override
9797
public void consumeDoneNotification(String consumerTag, ScanJobDescription scanJob) {
9898
try {
99-
int totalDone = counters.increaseJobStatusCount(scanJob.getStatus());
100-
int expectedTotal =
99+
long totalDone = counters.increaseJobStatusCount(scanJob.getStatus());
100+
long expectedTotal =
101101
bulkScan.getScanJobsPublished() != 0
102102
? bulkScan.getScanJobsPublished()
103103
: bulkScan.getTargetsGiven();

src/main/java/de/rub/nds/crawler/core/jobs/PublishBulkScanJob.java

Lines changed: 55 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,14 @@
1111
import de.rub.nds.crawler.config.ControllerCommandConfig;
1212
import de.rub.nds.crawler.constant.JobStatus;
1313
import de.rub.nds.crawler.core.ProgressMonitor;
14-
import de.rub.nds.crawler.data.BulkScan;
15-
import de.rub.nds.crawler.data.BulkScanInfo;
16-
import de.rub.nds.crawler.data.ScanJobDescription;
17-
import de.rub.nds.crawler.data.ScanTarget;
14+
import de.rub.nds.crawler.data.*;
1815
import de.rub.nds.crawler.denylist.IDenylistProvider;
1916
import de.rub.nds.crawler.orchestration.IOrchestrationProvider;
2017
import de.rub.nds.crawler.persistence.IPersistenceProvider;
2118
import de.rub.nds.crawler.targetlist.ITargetListProvider;
2219
import java.util.List;
23-
import java.util.Objects;
20+
import java.util.function.Function;
21+
import java.util.stream.Collectors;
2422
import org.apache.logging.log4j.LogManager;
2523
import org.apache.logging.log4j.Logger;
2624
import org.quartz.Job;
@@ -63,30 +61,24 @@ public void execute(JobExecutionContext context) throws JobExecutionException {
6361
// create and submit scan jobs for valid hosts
6462
LOGGER.info(
6563
"Filtering out denylisted hosts and hosts where the domain can not be resolved.");
66-
long submittedJobs =
64+
var submitter =
65+
new JobSubmitter(
66+
orchestrationProvider,
67+
persistenceProvider,
68+
denylistProvider,
69+
bulkScan,
70+
controllerConfig.getPort());
71+
var parsedJobStatuses =
6772
targetStringList.parallelStream()
68-
.map(
69-
targetString -> {
70-
ScanTarget target =
71-
ScanTarget.fromTargetString(
72-
targetString,
73-
controllerConfig.getPort(),
74-
denylistProvider);
75-
if (target != null) {
76-
orchestrationProvider.submitScanJob(
77-
new ScanJobDescription(
78-
target,
79-
new BulkScanInfo(bulkScan),
80-
bulkScan.getName(),
81-
bulkScan.getCollectionName(),
82-
JobStatus.TO_BE_EXECUTED));
83-
}
84-
return target;
85-
})
86-
.filter(Objects::nonNull)
87-
.count();
73+
.map(submitter)
74+
.collect(
75+
Collectors.groupingBy(
76+
Function.identity(), Collectors.counting()));
8877

89-
bulkScan.setScanJobsPublished((int) submittedJobs);
78+
long submittedJobs = parsedJobStatuses.get(JobStatus.TO_BE_EXECUTED);
79+
bulkScan.setScanJobsPublished(submittedJobs);
80+
bulkScan.setScanJobsResolutionErrors(parsedJobStatuses.get(JobStatus.UNRESOLVABLE));
81+
bulkScan.setScanJobsDenylisted(parsedJobStatuses.get(JobStatus.DENYLISTED));
9082
persistenceProvider.updateBulkScan(bulkScan);
9183

9284
if (controllerConfig.isMonitored() && submittedJobs == 0) {
@@ -100,4 +92,40 @@ public void execute(JobExecutionContext context) throws JobExecutionException {
10092
throw e2;
10193
}
10294
}
95+
96+
private static class JobSubmitter implements Function<String, JobStatus> {
97+
private final IOrchestrationProvider orchestrationProvider;
98+
private final IPersistenceProvider persistenceProvider;
99+
private final IDenylistProvider denylistProvider;
100+
private final BulkScan bulkScan;
101+
private final int defaultPort;
102+
103+
public JobSubmitter(
104+
IOrchestrationProvider orchestrationProvider,
105+
IPersistenceProvider persistenceProvider,
106+
IDenylistProvider denylistProvider,
107+
BulkScan bulkScan,
108+
int defaultPort) {
109+
this.orchestrationProvider = orchestrationProvider;
110+
this.persistenceProvider = persistenceProvider;
111+
this.denylistProvider = denylistProvider;
112+
this.bulkScan = bulkScan;
113+
this.defaultPort = defaultPort;
114+
}
115+
116+
@Override
117+
public JobStatus apply(String targetString) {
118+
var targetInfo =
119+
ScanTarget.fromTargetString(targetString, defaultPort, denylistProvider);
120+
ScanJobDescription jobDescription =
121+
new ScanJobDescription(targetInfo.getLeft(), bulkScan, targetInfo.getRight());
122+
if (jobDescription.getStatus() == JobStatus.TO_BE_EXECUTED) {
123+
orchestrationProvider.submitScanJob(jobDescription);
124+
} else {
125+
persistenceProvider.insertScanResult(
126+
new ScanResult(jobDescription, null), jobDescription);
127+
}
128+
return jobDescription.getStatus();
129+
}
130+
}
103131
}

src/main/java/de/rub/nds/crawler/data/BulkScan.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@ public class BulkScan implements Serializable {
3737

3838
private int targetsGiven;
3939

40-
private int scanJobsPublished;
40+
private long scanJobsPublished;
41+
private long scanJobsResolutionErrors;
42+
private long scanJobsDenylisted;
4143

4244
private int successfulScans;
4345

@@ -111,7 +113,7 @@ public int getTargetsGiven() {
111113
return this.targetsGiven;
112114
}
113115

114-
public int getScanJobsPublished() {
116+
public long getScanJobsPublished() {
115117
return this.scanJobsPublished;
116118
}
117119

@@ -168,7 +170,7 @@ public void setTargetsGiven(int targetsGiven) {
168170
this.targetsGiven = targetsGiven;
169171
}
170172

171-
public void setScanJobsPublished(int scanJobsPublished) {
173+
public void setScanJobsPublished(long scanJobsPublished) {
172174
this.scanJobsPublished = scanJobsPublished;
173175
}
174176

@@ -195,4 +197,20 @@ public Map<JobStatus, Integer> getJobStatusCounters() {
195197
public void setJobStatusCounters(Map<JobStatus, Integer> jobStatusCounters) {
196198
this.jobStatusCounters = jobStatusCounters;
197199
}
200+
201+
public long getScanJobsResolutionErrors() {
202+
return scanJobsResolutionErrors;
203+
}
204+
205+
public void setScanJobsResolutionErrors(long scanJobsResolutionErrors) {
206+
this.scanJobsResolutionErrors = scanJobsResolutionErrors;
207+
}
208+
209+
public long getScanJobsDenylisted() {
210+
return scanJobsDenylisted;
211+
}
212+
213+
public void setScanJobsDenylisted(long scanJobsDenylisted) {
214+
this.scanJobsDenylisted = scanJobsDenylisted;
215+
}
198216
}

src/main/java/de/rub/nds/crawler/data/ScanJobDescription.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ public ScanJobDescription(
4343
this.status = status;
4444
}
4545

46+
public ScanJobDescription(ScanTarget scanTarget, BulkScan bulkScan, JobStatus status) {
47+
this(
48+
scanTarget,
49+
new BulkScanInfo(bulkScan),
50+
bulkScan.getName(),
51+
bulkScan.getCollectionName(),
52+
status);
53+
}
54+
4655
private void readObject(java.io.ObjectInputStream in)
4756
throws IOException, ClassNotFoundException {
4857
// handle deserialization, cf. https://stackoverflow.com/a/3960558

src/main/java/de/rub/nds/crawler/data/ScanTarget.java

Lines changed: 53 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
*/
99
package de.rub.nds.crawler.data;
1010

11+
import de.rub.nds.crawler.constant.JobStatus;
1112
import de.rub.nds.crawler.denylist.IDenylistProvider;
1213
import java.io.Serializable;
1314
import java.net.InetAddress;
1415
import java.net.UnknownHostException;
16+
import org.apache.commons.lang3.tuple.Pair;
1517
import org.apache.commons.validator.routines.InetAddressValidator;
1618
import org.apache.logging.log4j.LogManager;
1719
import org.apache.logging.log4j.Logger;
@@ -28,56 +30,65 @@ public class ScanTarget implements Serializable {
2830
* @param denylistProvider which provides info if a host is denylisted
2931
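* @return a pair of the parsed ScanTarget and the JobStatus describing whether it can be scanned (TO_BE_EXECUTED, UNRESOLVABLE or DENYLISTED)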
3032
*/
31-
public static ScanTarget fromTargetString(
33+
public static Pair<ScanTarget, JobStatus> fromTargetString(
3234
String targetString, int defaultPort, IDenylistProvider denylistProvider) {
33-
ScanTarget target;
34-
try {
35-
target = new ScanTarget();
36-
// check if targetString contains rank (e.g. "1,example.com")
37-
38-
if (targetString.contains(",")) {
39-
if (targetString.split(",")[0].chars().allMatch(Character::isDigit)) {
40-
target.setTrancoRank(Integer.parseInt(targetString.split(",")[0]));
41-
targetString = targetString.split(",")[1];
42-
} else {
43-
targetString = "";
44-
}
45-
}
35+
ScanTarget target = new ScanTarget();
4636

47-
// Formatting for MX hosts
48-
if (targetString.contains("//")) {
49-
targetString = targetString.split("//")[1];
50-
}
51-
if (targetString.startsWith("\"") && targetString.endsWith("\"")) {
52-
targetString = targetString.replace("\"", "");
53-
System.out.println(targetString);
37+
// check if targetString contains rank (e.g. "1,example.com")
38+
if (targetString.contains(",")) {
39+
if (targetString.split(",")[0].chars().allMatch(Character::isDigit)) {
40+
target.setTrancoRank(Integer.parseInt(targetString.split(",")[0]));
41+
targetString = targetString.split(",")[1];
42+
} else {
43+
targetString = "";
5444
}
45+
}
5546

56-
// check if targetString contains port (e.g. "www.example.com:8080")
57-
if (targetString.contains(":")) {
58-
int port = Integer.parseInt(targetString.split(":")[1]);
59-
targetString = targetString.split(":")[0];
60-
if (port > 1 && port < 65535) {
61-
target.setPort(port);
62-
}
63-
} else {
64-
target.setPort(defaultPort);
47+
// Formatting for MX hosts
48+
if (targetString.contains("//")) {
49+
targetString = targetString.split("//")[1];
50+
}
51+
if (targetString.startsWith("\"") && targetString.endsWith("\"")) {
52+
targetString = targetString.replace("\"", "");
53+
System.out.println(targetString);
54+
}
55+
56+
// check if targetString contains port (e.g. "www.example.com:8080")
57+
// FIXME treating ':' as the host/port separator breaks IPv6 literals, which contain colons themselves
58+
if (targetString.contains(":")) {
59+
int port = Integer.parseInt(targetString.split(":")[1]);
60+
targetString = targetString.split(":")[0];
61+
if (port > 1 && port < 65535) {
62+
target.setPort(port);
6563
}
66-
if (InetAddressValidator.getInstance().isValid(targetString)) {
67-
target.setIp(targetString);
68-
} else {
64+
} else {
65+
target.setPort(defaultPort);
66+
}
67+
68+
if (InetAddressValidator.getInstance().isValid(targetString)) {
69+
target.setIp(targetString);
70+
} else {
71+
target.setHostname(targetString);
72+
try {
73+
// TODO this only allows one IP per hostname; it may be interesting to scan all IPs
74+
// for a domain, or at least one v4 and one v6
6975
target.setIp(InetAddress.getByName(targetString).getHostAddress());
70-
target.setHostname(targetString);
71-
}
72-
if (denylistProvider != null && denylistProvider.isDenylisted(target)) {
73-
LOGGER.error("Host {} is blacklisted and will not be scanned.", targetString);
76+
} catch (UnknownHostException e) {
77+
LOGGER.error(
78+
"Host {} is unknown or can not be reached with error {}.", targetString, e);
79+
// TODO in the current design we discard the exception info; maybe we want to keep
80+
// this in the future
81+
return Pair.of(target, JobStatus.UNRESOLVABLE);
7482
}
75-
} catch (UnknownHostException e) {
76-
LOGGER.error(
77-
"Host {} is unknown or can not be reached with error {}.", targetString, e);
78-
return null;
7983
}
80-
return target;
84+
if (denylistProvider != null && denylistProvider.isDenylisted(target)) {
85+
LOGGER.error("Host {} is denylisted and will not be scanned.", targetString);
86+
// TODO similar to the UnknownHostException, we do not keep any information as to why
87+
// the target is denylisted; it may be nice to distinguish cases where the domain is
88+
// blocked from cases where the IP is blocked
89+
return Pair.of(target, JobStatus.DENYLISTED);
90+
}
91+
return Pair.of(target, JobStatus.TO_BE_EXECUTED);
8192
}
8293

8394
private String ip;

0 commit comments

Comments
 (0)