diff --git a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java index 1a939c9c0..cbcd4bf72 100644 --- a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java +++ b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java @@ -18,6 +18,7 @@ package org.apache.stormcrawler.protocol.okhttp; import java.io.IOException; +import java.io.InterruptedIOException; import java.net.InetSocketAddress; import java.net.MalformedURLException; import java.net.Proxy; @@ -160,6 +161,10 @@ public void configure(Config conf) { .writeTimeout(timeout, TimeUnit.MILLISECONDS) .readTimeout(timeout, TimeUnit.MILLISECONDS); + if (completionTimeout >= 0) { + builder.callTimeout(completionTimeout, TimeUnit.SECONDS); + } + // protocols in order of preference, see // https://square.github.io/okhttp/4.x/okhttp/okhttp3/-ok-http-client/-builder/protocols/ final List protocols = new ArrayList<>(); @@ -430,17 +435,18 @@ public ProtocolResponse getProtocolOutput(String url, final Metadata metadata) responsemetadata.addValue(key.toLowerCase(Locale.ROOT), value); } - final MutableObject trimmed = new MutableObject(TrimmedContentReason.NOT_TRIMMED); + final MutableObject trimmed = + new MutableObject<>(TrimmedContentReason.NOT_TRIMMED); final byte[] bytes = toByteArray(response.body(), pageMaxContent, trimmed); - if (trimmed.getValue() != TrimmedContentReason.NOT_TRIMMED) { + if (trimmed.get() != TrimmedContentReason.NOT_TRIMMED) { if (!call.isCanceled()) { call.cancel(); } responsemetadata.setValue(ProtocolResponse.TRIMMED_RESPONSE_KEY, "true"); responsemetadata.setValue( ProtocolResponse.TRIMMED_RESPONSE_REASON_KEY, - trimmed.getValue().toString().toLowerCase(Locale.ROOT)); - LOG.warn("HTTP content trimmed to {}", bytes.length); + trimmed.get().toString().toLowerCase(Locale.ROOT)); + LOG.warn("HTTP content trimmed to {} (reason: {})", bytes.length, trimmed.get()); } final Long dnsResolution = DNStimes.remove(call.toString()); @@ -453,7 +459,9 @@ public ProtocolResponse getProtocolOutput(String url, final Metadata metadata) } private byte[] toByteArray( - final ResponseBody responseBody, int maxContent, MutableObject trimmed) + final ResponseBody responseBody, + int maxContent, + MutableObject trimmed) throws IOException { if (responseBody == null) { @@ -493,7 +501,12 @@ private byte[] toByteArray( // requesting more content failed, e.g. by a socket timeout if (partialContentAsTrimmed && source.getBuffer().size() > 0) { // treat already fetched content as trimmed - trimmed.setValue(TrimmedContentReason.DISCONNECT); + if (e instanceof InterruptedIOException) { + // thrown by OkHttp if the call timeout is hit + trimmed.setValue(TrimmedContentReason.TIME); + } else { + trimmed.setValue(TrimmedContentReason.DISCONNECT); + } LOG.debug("Exception while fetching {}", e); } else { throw e;