Bean Reference

Note

This reference is a work in progress and does not yet cover all available beans. For a more complete list of Heritrix beans please refer to the javadoc.

Core Beans

ActionDirectory

Directory watched for new files. Depending on their extension, files will be processed with regard to the current crawl, and then renamed with a datestamp into the ‘done’ directory.

Currently supports:

- .seeds(.gz): add each URI found in file as a new seed (to be crawled if not already; to affect scope if appropriate).
- (.s).recover(.gz): treat as traditional recovery log: consider all ‘Fs’-tagged lines included, then try rescheduling all ‘F+’-tagged lines. (If “.s.” is present, try scoping URIs before including/scheduling.)
- (.s).include(.gz): add each URI found in a recover-log-like file (regardless of its tagging) to the frontier’s alreadyIncluded filter, preventing them from being recrawled. (‘.s.’ indicates to apply scoping.)
- (.s).schedule(.gz): add each URI found in a recover-log-like file (regardless of its tagging) to the frontier’s queues. (‘.s.’ indicates to apply scoping.)

Future support planned:

- .robots: invalidate robots ASAP
- (?) .block: block-all on named site(s)
- .overlay: add new overlay settings
- .js .rb .bsh etc.: execute arbitrary script (a la ScriptedProcessor)

<bean id="actionDirectory" class="org.archive.crawler.framework.ActionDirectory">
  <!-- <property name="actionDir" value="" /> -->
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="delaySeconds" value="30" /> -->
  <!-- <property name="doneDir" value="" /> -->
  <!-- <property name="frontier" value="" /> -->
  <!-- <property name="initialDelaySeconds" value="10" /> -->
  <!-- <property name="seeds" value="" /> -->
</bean>
import org.archive.crawler.framework.ActionDirectory

actionDirectory(ActionDirectory) {
    // actionDir = ''
    // applicationContext = ''
    // delaySeconds = 30
    // doneDir = ''
    // frontier = ''
    // initialDelaySeconds = 10
    // seeds = ''
}
actionDir
(org.archive.spring.ConfigPath)
applicationContext
(org.springframework.context.ApplicationContext)
delaySeconds
(int) delay between scans of actionDirectory for new files
doneDir
(org.archive.spring.ConfigPath)
frontier
(org.archive.crawler.framework.Frontier) autowired frontier for actions
initialDelaySeconds
(int) how long after crawl start to first scan action directory
seeds
(org.archive.modules.seeds.SeedModule)

BdbCookieStore

Cookie store using bdb for storage. Cookies are stored in a SortedMap keyed by #sortableKey(Cookie), so they are grouped together by domain. #cookieStoreFor(String) returns a facade whose CookieStore#getCookies() returns a list of cookies limited to the supplied host and parent domains, if applicable.

<bean id="bdbCookieStore" class="org.archive.modules.fetcher.BdbCookieStore">
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="cookiesLoadFile" value="" /> -->
  <!-- <property name="cookiesSaveFile" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
</bean>
import org.archive.modules.fetcher.BdbCookieStore

bdbCookieStore(BdbCookieStore) {
    // bdbModule = ''
    // cookiesLoadFile = ''
    // cookiesSaveFile = ''
    // recoveryCheckpoint = ''
}
bdbModule
(org.archive.bdb.BdbModule)
cookiesLoadFile
(org.archive.spring.ConfigFile)
cookiesSaveFile
(org.archive.spring.ConfigPath)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)

BdbFrontier

A Frontier using several BerkeleyDB JE Databases to hold its record of known hosts (queues), and pending URIs.

<bean id="bdbFrontier" class="org.archive.crawler.frontier.BdbFrontier">
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="balanceReplenishAmount" value="3000" /> -->
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="dumpPendingAtClose" value="false" /> -->
  <!-- <property name="errorPenaltyAmount" value="100" /> -->
  <!-- <property name="extract404s" value="true" /> -->
  <!-- <property name="extractIndependently" value="false" /> -->
  <!-- <property name="frontierPreparer" value="" /> -->
  <!-- <property name="largestQueuesCount" value="" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="maxOutlinks" value="6000" /> -->
  <!-- <property name="maxQueuesPerReportCategory" value="2000" /> -->
  <!-- <property name="maxRetries" value="30" /> -->
  <!-- <property name="precedenceFloor" value="255" /> -->
  <!-- <property name="queuePrecedencePolicy" value="" /> -->
  <!-- <property name="queueTotalBudget" value="-1" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="recoveryLogEnabled" value="true" /> -->
  <!-- <property name="retryDelaySeconds" value="900" /> -->
  <!-- <property name="scope" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="sheetOverlaysManager" value="" /> -->
  <!-- <property name="snoozeLongMs" value="" /> -->
  <!-- <property name="uriUniqFilter" value="" /> -->
</bean>
import org.archive.crawler.frontier.BdbFrontier

bdbFrontier(BdbFrontier) {
    // applicationContext = ''
    // balanceReplenishAmount = 3000
    // bdbModule = ''
    // crawlController = ''
    // dumpPendingAtClose = false
    // errorPenaltyAmount = 100
    // extract404s = true
    // extractIndependently = false
    // frontierPreparer = ''
    // largestQueuesCount = 0
    // loggerModule = ''
    // maxOutlinks = 6000
    // maxQueuesPerReportCategory = 2000
    // maxRetries = 30
    // precedenceFloor = 255
    // queuePrecedencePolicy = ''
    // queueTotalBudget = -1
    // recoveryCheckpoint = ''
    // recoveryLogEnabled = true
    // retryDelaySeconds = 900
    // scope = ''
    // seeds = ''
    // serverCache = ''
    // sheetOverlaysManager = ''
    // snoozeLongMs = 0
    // uriUniqFilter = ''
}
applicationContext
(org.springframework.context.ApplicationContext)
balanceReplenishAmount
(int) amount to replenish budget on each activation (duty cycle)
bdbModule
(org.archive.bdb.BdbModule)
crawlController
(org.archive.crawler.framework.CrawlController)
dumpPendingAtClose
(boolean)
errorPenaltyAmount
(int) budget penalty for an error fetch
extract404s
(boolean)
extractIndependently
(boolean)
frontierPreparer
(org.archive.crawler.prefetch.FrontierPreparer)
largestQueuesCount
(int)
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
maxOutlinks
(int)
maxQueuesPerReportCategory
(int) truncate reporting of queues at this large but not unbounded number
maxRetries
(int) maximum times to emit a CrawlURI without final disposition
precedenceFloor
(int) precedence rank at or below which queues are not crawled
queuePrecedencePolicy
(org.archive.crawler.frontier.precedence.QueuePrecedencePolicy) queue precedence assignment policy to use.
queueTotalBudget
(long) total expenditure to allow a queue before ‘retiring’ it
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
recoveryLogEnabled
(boolean) Recover log on or off attribute.
retryDelaySeconds
(int) for retryable problems, seconds to wait before a retry
scope
(org.archive.modules.deciderules.DecideRule)
seeds
(org.archive.modules.seeds.SeedModule)
serverCache
(org.archive.modules.net.ServerCache)
sheetOverlaysManager
(org.archive.crawler.spring.SheetOverlaysManager)
snoozeLongMs
(long) When a snooze target for a queue is longer than this amount, the queue will be “long snoozed” instead of “short snoozed”. A “long snoozed” queue may be swapped to disk because it’s not needed soon.
uriUniqFilter
(org.archive.crawler.datamodel.UriUniqFilter) The UriUniqFilter to use, tracking those UURIs which are already in-process (or processed), and thus should not be rescheduled. Also known as the ‘alreadyIncluded’ or ‘alreadySeen’ structure.

BdbModule

Utility module for managing a shared BerkeleyDB-JE environment

<bean id="bdbModule" class="org.archive.bdb.BdbModule">
  <!-- <property name="cachePercent" value="-1" /> -->
  <!-- <property name="cacheSize" value="-1" /> -->
  <!-- <property name="cleanerThreads" value="" /> -->
  <!-- <property name="dir" value="" /> -->
  <!-- <property name="evictorCoreThreads" value="-1" /> -->
  <!-- <property name="evictorMaxThreads" value="-1" /> -->
  <!-- <property name="expectedConcurrency" value="64" /> -->
  <!-- <property name="maxLogFileSize" value="10000000" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="useHardLinkCheckpoints" value="true" /> -->
  <!-- <property name="useSharedCache" value="true" /> -->
</bean>
import org.archive.bdb.BdbModule

bdbModule(BdbModule) {
    // cachePercent = -1
    // cacheSize = -1
    // cleanerThreads = 0
    // dir = ''
    // evictorCoreThreads = -1
    // evictorMaxThreads = -1
    // expectedConcurrency = 64
    // maxLogFileSize = 10000000
    // recoveryCheckpoint = ''
    // useHardLinkCheckpoints = true
    // useSharedCache = true
}
cachePercent
(int)
cacheSize
(int)
cleanerThreads
(int)
dir
(org.archive.spring.ConfigPath)
evictorCoreThreads
(int) Configure the number of evictor threads (-1 means use the default). See https://docs.oracle.com/cd/E17277_02/html/java/com/sleepycat/je/EnvironmentConfig.html#EVICTOR_CORE_THREADS
evictorMaxThreads
(int) Configure the maximum number of evictor threads (-1 means use the default). See https://docs.oracle.com/cd/E17277_02/html/java/com/sleepycat/je/EnvironmentConfig.html#EVICTOR_MAX_THREADS
expectedConcurrency
(int) Expected number of concurrent threads; used to tune nLockTables according to the JE FAQ: http://www.oracle.com/technology/products/berkeley-db/faq/je_faq.html#33
maxLogFileSize
(long)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
useHardLinkCheckpoints
(boolean) Whether to use hard-links to log files to collect/retain the BDB log files needed for a checkpoint. Default is true. May not work on Windows (especially on pre-NTFS filesystems). If false, the BDB ‘je.cleaner.expunge’ value will be set to ‘false’ as well, meaning BDB will *not* delete obsolete JDB files, but only rename them to ‘.DEL’. They will have to be manually deleted to free disk space, but .DEL files referenced in any checkpoint’s ‘jdbfiles.manifest’ should be retained to keep the checkpoint valid.
useSharedCache
(boolean)

BdbServerCache

ServerCache backed by BDB big maps; the usual choice for crawls.

<bean id="bdbServerCache" class="org.archive.modules.net.BdbServerCache">
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
</bean>
import org.archive.modules.net.BdbServerCache

bdbServerCache(BdbServerCache) {
    // bdbModule = ''
    // recoveryCheckpoint = ''
}
bdbModule
(org.archive.bdb.BdbModule)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)

BdbUriUniqFilter

A BDB implementation of an AlreadySeen list.

This implementation performs adequately without blowing out the heap. See AlreadySeen.

Makes keys that have URIs from the same server close to each other. Mercator and 2.3.5 ‘Eliminating Already-Visited URLs’ in ‘Mining the Web’ by Soumen Chakrabarti talk of a two-level key with the first 24 bits a hash of the host plus port and with the last 40 as a hash of the path. Testing showed adoption of such a scheme halving lookup times. (This implementation actually concatenates scheme + host in the first 24 bits and path + query in the trailing 40 bits.)

<bean id="bdbUriUniqFilter" class="org.archive.crawler.util.BdbUriUniqFilter">
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="destination" value="" /> -->
  <!-- <property name="profileLog" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
</bean>
import org.archive.crawler.util.BdbUriUniqFilter

bdbUriUniqFilter(BdbUriUniqFilter) {
    // bdbModule = ''
    // destination = ''
    // profileLog = ''
    // recoveryCheckpoint = ''
}
bdbModule
(org.archive.bdb.BdbModule)
destination
(org.archive.crawler.datamodel.UriUniqFilter.CrawlUriReceiver)
profileLog
(java.io.File)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)

CheckpointService

Executes checkpoints, and offers convenience methods for enumerating available Checkpoints and injecting a recovery-Checkpoint after build and before launch (setRecoveryCheckpointByName).

Offers optional automatic checkpointing at a configurable interval in minutes.

<bean id="checkpointService" class="org.archive.crawler.framework.CheckpointService">
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="checkpointIntervalMinutes" value="-1" /> -->
  <!-- <property name="checkpointOnShutdown" value="false" /> -->
  <!-- <property name="checkpointsDir" value="" /> -->
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="forgetAllButLatest" value="false" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="recoveryCheckpointByName" value="" /> -->
</bean>
import org.archive.crawler.framework.CheckpointService

checkpointService(CheckpointService) {
    // applicationContext = ''
    // checkpointIntervalMinutes = -1
    // checkpointOnShutdown = false
    // checkpointsDir = ''
    // crawlController = ''
    // forgetAllButLatest = false
    // recoveryCheckpoint = ''
    // recoveryCheckpointByName = ''
}
applicationContext
(org.springframework.context.ApplicationContext)
checkpointIntervalMinutes
(long) Period at which to create automatic checkpoints; -1 means no auto checkpointing.
checkpointOnShutdown
(boolean) Whether a checkpoint should be made when the JVM is shut down. Default is false.
checkpointsDir
(org.archive.spring.ConfigPath) Checkpoints directory
crawlController
(org.archive.crawler.framework.CrawlController)
forgetAllButLatest
(boolean) True to save only the latest checkpoint, false to save all of them. Default is false.
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
recoveryCheckpointByName
(java.lang.String) Given the name of a valid checkpoint subdirectory in the checkpoints directory, create a Checkpoint instance, and insert it into all Checkpointable beans.

@param selectedCheckpoint

CrawlController

CrawlController collects all the classes which cooperate to perform a crawl and provides a high-level interface to the running crawl.

As the “global context” for a crawl, subcomponents will often reach each other through the CrawlController.

<bean id="crawlController" class="org.archive.crawler.framework.CrawlController">
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="candidateChain" value="" /> -->
  <!-- <property name="dispositionChain" value="" /> -->
  <!-- <property name="fetchChain" value="" /> -->
  <!-- <property name="frontier" value="" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="maxToeThreads" value="" /> -->
  <!-- <property name="metadata" value="" /> -->
  <!-- <property name="pauseAtStart" value="true" /> -->
  <!-- <property name="recorderInBufferBytes" value="" /> -->
  <!-- <property name="recorderOutBufferBytes" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="runWhileEmpty" value="false" /> -->
  <!-- <property name="scratchDir" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="statisticsTracker" value="" /> -->
</bean>
import org.archive.crawler.framework.CrawlController

crawlController(CrawlController) {
    // applicationContext = ''
    // candidateChain = ''
    // dispositionChain = ''
    // fetchChain = ''
    // frontier = ''
    // loggerModule = ''
    // maxToeThreads = 0
    // metadata = ''
    // pauseAtStart = true
    // recorderInBufferBytes = 0
    // recorderOutBufferBytes = 0
    // recoveryCheckpoint = ''
    // runWhileEmpty = false
    // scratchDir = ''
    // seeds = ''
    // serverCache = ''
    // statisticsTracker = ''
}
applicationContext
(org.springframework.context.ApplicationContext)
candidateChain
(org.archive.modules.CandidateChain) Candidate chain
dispositionChain
(org.archive.modules.DispositionChain) Disposition chain
fetchChain
(org.archive.modules.FetchChain) Fetch chain
frontier
(org.archive.crawler.framework.Frontier) The frontier to use for the crawl.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
maxToeThreads
(int) Maximum number of threads processing URIs at the same time.
metadata
(org.archive.modules.CrawlMetadata)
pauseAtStart
(boolean) whether to pause at crawl start
recorderInBufferBytes
(int) Size in bytes of in-memory buffer to record inbound traffic. One such buffer is reserved for every ToeThread.
recorderOutBufferBytes
(int) Size in bytes of in-memory buffer to record outbound traffic. One such buffer is reserved for every ToeThread.
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
runWhileEmpty
(boolean) whether to keep running (without pause or finish) when frontier is empty
scratchDir
(org.archive.spring.ConfigPath) Scratch directory for temporary overflow-to-disk
seeds
(org.archive.modules.seeds.SeedModule)
serverCache
(org.archive.modules.net.ServerCache)
statisticsTracker
(org.archive.crawler.reporting.StatisticsTracker) Statistics tracking modules.  Any number of specialized statistics trackers that monitor a crawl and write logs, reports and/or provide information to the user interface.

CrawlerLoggerModule

Module providing all expected whole-crawl logging facilities

<bean id="crawlerLoggerModule" class="org.archive.crawler.reporting.CrawlerLoggerModule">
  <!-- <property name="alertsLogPath" value="" /> -->
  <!-- <property name="crawlLogPath" value="" /> -->
  <!-- <property name="logExtraInfo" value="false" /> -->
  <!-- <property name="nonfatalErrorsLogPath" value="" /> -->
  <!-- <property name="path" value="" /> -->
  <!-- <property name="progressLogPath" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="runtimeErrorsLogPath" value="" /> -->
  <!-- <property name="upSimpleLog" value="" /> -->
  <!-- <property name="uriErrorsLogPath" value="" /> -->
</bean>
import org.archive.crawler.reporting.CrawlerLoggerModule

crawlerLoggerModule(CrawlerLoggerModule) {
    // alertsLogPath = ''
    // crawlLogPath = ''
    // logExtraInfo = false
    // nonfatalErrorsLogPath = ''
    // path = ''
    // progressLogPath = ''
    // recoveryCheckpoint = ''
    // runtimeErrorsLogPath = ''
    // upSimpleLog = ''
    // uriErrorsLogPath = ''
}
alertsLogPath
(org.archive.spring.ConfigPath)
crawlLogPath
(org.archive.spring.ConfigPath)
logExtraInfo
(boolean) Whether to include the “extra info” field for each entry in crawl.log. “Extra info” is arbitrary JSON. It is the last field of the log line.
nonfatalErrorsLogPath
(org.archive.spring.ConfigPath)
path
(org.archive.spring.ConfigPath)
progressLogPath
(org.archive.spring.ConfigPath)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
runtimeErrorsLogPath
(org.archive.spring.ConfigPath)
upSimpleLog
(java.lang.String)
uriErrorsLogPath
(org.archive.spring.ConfigPath)

CrawlLimitEnforcer

Bean to enforce limits on the size of a crawl in URI count, byte count, or elapsed time. It is triggered by the StatSnapshotEvent, so limits are only checked at the interval (configured in StatisticsTracker) of those events.

<bean id="crawlLimitEnforcer" class="org.archive.crawler.framework.CrawlLimitEnforcer">
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="maxBytesDownload" value="0" /> -->
  <!-- <property name="maxDocumentsDownload" value="0" /> -->
  <!-- <property name="maxNovelBytes" value="0" /> -->
  <!-- <property name="maxNovelUrls" value="0" /> -->
  <!-- <property name="maxTimeSeconds" value="0" /> -->
  <!-- <property name="maxWarcNovelBytes" value="0" /> -->
  <!-- <property name="maxWarcNovelUrls" value="0" /> -->
</bean>
import org.archive.crawler.framework.CrawlLimitEnforcer

crawlLimitEnforcer(CrawlLimitEnforcer) {
    // crawlController = ''
    // maxBytesDownload = 0
    // maxDocumentsDownload = 0
    // maxNovelBytes = 0
    // maxNovelUrls = 0
    // maxTimeSeconds = 0
    // maxWarcNovelBytes = 0
    // maxWarcNovelUrls = 0
}
crawlController
(org.archive.crawler.framework.CrawlController)
maxBytesDownload
(long) Maximum number of bytes to download. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.
maxDocumentsDownload
(long) Maximum number of documents to download. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.
maxNovelBytes
(long) Maximum number of uncompressed payload bytes to write to WARC response or resource records. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.
maxNovelUrls
(long) Maximum number of novel (not deduplicated) urls to download. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.
maxTimeSeconds
(long) Maximum amount of time to crawl (in seconds). Once this much time has elapsed the crawler will stop. A value of zero means no upper limit.
maxWarcNovelBytes
(long) Maximum number of novel (not deduplicated) bytes to write to WARC response or resource records. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.
maxWarcNovelUrls
(long) Maximum number of urls to write to WARC response or resource records. Once this number is exceeded the crawler will stop. A value of zero means no upper limit.

CrawlMetadata

Basic crawl metadata, as consulted by functional modules and recorded in ARCs/WARCs.

<bean id="crawlMetadata" class="org.archive.modules.CrawlMetadata">
  <!-- <property name="audience" value="" /> -->
  <!-- <property name="availableRobotsPolicies" value="" /> -->
  <!-- <property name="description" value="" /> -->
  <!-- <property name="jobName" value="" /> -->
  <!-- <property name="operator" value="" /> -->
  <!-- <property name="operatorContactUrl" value="" /> -->
  <!-- <property name="operatorFrom" value="" /> -->
  <!-- <property name="organization" value="" /> -->
  <!-- <property name="robotsPolicyName" value="obey" /> -->
  <!-- <property name="userAgentTemplate" value="Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)" /> -->
</bean>
import org.archive.modules.CrawlMetadata

crawlMetadata(CrawlMetadata) {
    // audience = ''
    // availableRobotsPolicies = ''
    // description = ''
    // jobName = ''
    // operator = ''
    // operatorContactUrl = ''
    // operatorFrom = ''
    // organization = ''
    // robotsPolicyName = 'obey'
    // userAgentTemplate = 'Mozilla/5.0 (compatible; heritrix/@VERSION@ +@OPERATOR_CONTACT_URL@)'
}
audience
(java.lang.String)
availableRobotsPolicies
(java.util.Map<java.lang.String,org.archive.modules.net.RobotsPolicy>) Map of all available RobotsPolicies, by name, to choose from. Assembled from declared instances in configuration plus the standard ‘obey’ (aka ‘classic’) and ‘ignore’ policies.
description
(java.lang.String)
jobName
(java.lang.String)
operator
(java.lang.String)
operatorContactUrl
(java.lang.String)
operatorFrom
(java.lang.String)
organization
(java.lang.String)
robotsPolicyName
(java.lang.String) Robots policy name
userAgentTemplate
(java.lang.String)

CredentialStore

Front door to the credential store.

Come here to get at credentials.

See CredentialStore Design: http://crawler.archive.org/proposals/auth/#credentialstoredesign

<bean id="credentialStore" class="org.archive.modules.credential.CredentialStore">
  <!-- <property name="credentials" value="" /> -->
</bean>
import org.archive.modules.credential.CredentialStore

credentialStore(CredentialStore) {
    // credentials = ''
}
credentials
(java.util.Map<java.lang.String,org.archive.modules.credential.Credential>) Credentials used by Heritrix when authenticating. See http://crawler.archive.org/proposals/auth/ for background.

DecideRuleSequence

<bean id="decideRuleSequence" class="org.archive.modules.deciderules.DecideRuleSequence">
  <!-- <property name="logExtraInfo" value="false" /> -->
  <!-- <property name="logToFile" value="false" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="rules" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.modules.deciderules.DecideRuleSequence

decideRuleSequence(DecideRuleSequence) {
    // logExtraInfo = false
    // logToFile = false
    // loggerModule = ''
    // rules = ''
    // serverCache = ''
}
logExtraInfo
(boolean) Whether to include the “extra info” field for each entry in crawl.log. “Extra info” is a json object with entries “host”, “via”, “source” and “hopPath”.
logToFile
(boolean) If enabled, log decisions to file named logs/{spring-bean-id}.log. Format is: [timestamp] [decisive-rule-num] [decisive-rule-class] [decision] [uri] [extraInfo]

Relies on Spring Lifecycle to initialize the log. Only top-level beans get the Lifecycle treatment from Spring, so the bean must be top-level for logToFile to work. (This is true of other modules that support logToFile, and anything else that uses Lifecycle, as well.)

loggerModule
(org.archive.modules.SimpleFileLoggerProvider)
rules
(java.util.List<org.archive.modules.deciderules.DecideRule>)
serverCache
(org.archive.modules.net.ServerCache)

DiskSpaceMonitor

Monitors the available space on the paths configured. If the available space drops below a specified threshold a crawl pause is requested.

Monitoring is done via the java.io.File.getUsableSpace() method. This method will sometimes fail on network attached storage, returning 0 bytes available even if that is not actually the case.

Paths that do not resolve to actual filesystem folders or files will not be evaluated (i.e. if java.io.File.exists() returns false, no further processing is carried out on that File).

Paths are checked for available space whenever a StatSnapshotEvent occurs.

<bean id="diskSpaceMonitor" class="org.archive.crawler.monitor.DiskSpaceMonitor">
  <!-- <property name="configPathConfigurer" value="" /> -->
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="monitorConfigPaths" value="true" /> -->
  <!-- <property name="monitorPaths" value="" /> -->
  <!-- <property name="pauseThresholdMiB" value="8192" /> -->
</bean>
import org.archive.crawler.monitor.DiskSpaceMonitor

diskSpaceMonitor(DiskSpaceMonitor) {
    // configPathConfigurer = ''
    // crawlController = ''
    // monitorConfigPaths = true
    // monitorPaths = ''
    // pauseThresholdMiB = 8192
}
configPathConfigurer
(org.archive.spring.ConfigPathConfigurer) Autowire access to ConfigPathConfigurer
crawlController
(org.archive.crawler.framework.CrawlController) Autowire access to CrawlController
monitorConfigPaths
(boolean) If enabled, all the paths returned by ConfigPathConfigurer#getAllConfigPaths() will be monitored in addition to any paths explicitly specified via #setMonitorPaths(List).

true by default.

Note: This is not guaranteed to contain all paths that Heritrix writes to. It is the responsibility of modules that write to disk to register their activity with the ConfigPathConfigurer, and some may not do so.

@param monitorConfigPaths If config paths should be monitored for usable space.

monitorPaths
(java.util.List<java.lang.String>) List of filesystem paths that should be monitored for available space.
pauseThresholdMiB
(long) Set the minimum amount of space that must be available on all monitored paths. If the amount falls below this pause threshold on any path the crawl will be paused. The desired pause threshold value is specified in mebibytes (MiB).

RulesCanonicalizationPolicy

URI Canonicalization Policy

<bean id="rulesCanonicalizationPolicy" class="org.archive.modules.canonicalize.RulesCanonicalizationPolicy">
  <!-- <property name="rules" value="" /> -->
</bean>
import org.archive.modules.canonicalize.RulesCanonicalizationPolicy

rulesCanonicalizationPolicy(RulesCanonicalizationPolicy) {
    // rules = ''
}
rules
(java.util.List<org.archive.modules.canonicalize.CanonicalizationRule>)

SheetOverlaysManager

Manager which marks-up CrawlURIs with the names of all applicable Sheets, and returns overlay maps by name.

<bean id="sheetOverlaysManager" class="org.archive.crawler.spring.SheetOverlaysManager">
  <!-- <property name="beanFactory" value="" /> -->
  <!-- <property name="sheetsByName" value="" /> -->
</bean>
import org.archive.crawler.spring.SheetOverlaysManager

sheetOverlaysManager(SheetOverlaysManager) {
    // beanFactory = ''
    // sheetsByName = ''
}
beanFactory
(org.springframework.beans.factory.BeanFactory)
sheetsByName
(java.util.Map<java.lang.String,org.archive.spring.Sheet>) Collect all Sheets, by beanName. @param map

StatisticsTracker

This is an implementation of the AbstractTracker. It is designed to function with the WUI as well as performing various logging activity.

At the end of each snapshot a line is written to the ‘progress-statistics.log’ file.

The header of that file is as follows:

 [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]
First there is a timestamp, accurate down to 1 second.

discovered, queued, downloaded and dl-failures are (respectively) the discovered URI count, pending URI count, successfully fetched count and failed fetch count from the frontier at the time of the snapshot.

KB/s(avg) is the bandwidth usage. We use the total bytes downloaded to calculate average bandwidth usage (KB/sec). Since we also note the value each time a snapshot is made we can calculate the average bandwidth usage during the last snapshot period to gain a “current” rate. The first number is the current rate and the average is in parentheses.

doc/s(avg) works the same way as KB/s(avg) except it shows the number of documents (URIs) rather than KB downloaded.

busy-threads is the total number of ToeThreads that are not available (and thus presumably busy processing a URI). This information is extracted from the crawl controller.

Finally, mem-use-KB is extracted from the run time environment (Runtime.getRuntime().totalMemory()).

In addition to the data collected for the above logs, various other data is gathered and stored by this tracker:

- Successfully downloaded documents per fetch status code
- Successfully downloaded documents per document mime type
- Amount of data per mime type
- Successfully downloaded documents per host
- Amount of data per host
- Disposition of all seeds (this is written to ‘reports.log’ at end of crawl)
- Successfully downloaded documents per host per source

<bean id="statisticsTracker" class="org.archive.crawler.reporting.StatisticsTracker">
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="intervalSeconds" value="20" /> -->
  <!-- <property name="keepSnapshotsCount" value="5" /> -->
  <!-- <property name="liveHostReportSize" value="20" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="reports" value="" /> -->
  <!-- <property name="reportsDir" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="trackSeeds" value="true" /> -->
  <!-- <property name="trackSources" value="true" /> -->
</bean>
import org.archive.crawler.reporting.StatisticsTracker

statisticsTracker(StatisticsTracker) {
    // applicationContext = ''
    // bdbModule = ''
    // crawlController = ''
    // intervalSeconds = 20
    // keepSnapshotsCount = 5
    // liveHostReportSize = 20
    // recoveryCheckpoint = ''
    // reports = ''
    // reportsDir = ''
    // seeds = ''
    // serverCache = ''
    // trackSeeds = true
    // trackSources = true
}
applicationContext
(org.springframework.context.ApplicationContext)
bdbModule
(org.archive.bdb.BdbModule)
crawlController
(org.archive.crawler.framework.CrawlController)
intervalSeconds
(int) The interval between writing progress information to log.
keepSnapshotsCount
(int) Number of crawl-stat sample snapshots to keep for calculation purposes.
liveHostReportSize
(int)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
reports
(java.util.List<org.archive.crawler.reporting.Report>)
reportsDir
(org.archive.spring.ConfigPath)
seeds
(org.archive.modules.seeds.SeedModule)
serverCache
(org.archive.modules.net.ServerCache)
trackSeeds
(boolean) Whether to maintain seed disposition records (expensive in crawls with millions of seeds)
trackSources
(boolean) Whether to maintain hosts-per-source-tag records; very expensive in crawls with large numbers of source-tags (seeds) or large crawls over many hosts

TextSeedModule

Module that announces a list of seeds from a text source (such as a ConfigFile or ConfigString), and provides a mechanism for adding seeds after a crawl has begun.

<bean id="textSeedModule" class="org.archive.modules.seeds.TextSeedModule">
  <!-- <property name="blockAwaitingSeedLines" value="-1" /> -->
  <!-- <property name="seedListeners" value="" /> -->
  <!-- <property name="sourceTagSeeds" value="false" /> -->
  <!-- <property name="textSource" value="" /> -->
</bean>
import org.archive.modules.seeds.TextSeedModule

textSeedModule(TextSeedModule) {
    // blockAwaitingSeedLines = -1
    // seedListeners = ''
    // sourceTagSeeds = false
    // textSource = ''
}
blockAwaitingSeedLines
(int) Number of lines of seeds-source to read on initial load before proceeding with crawl. Default is -1, meaning all. Any other value will cause that number of lines to be loaded before fetching begins, while all extra lines continue to be processed in the background. Generally, this should only be changed when working with very large seed lists, and scopes that do *not* depend on reading all seeds.
seedListeners
(java.util.Set<org.archive.modules.seeds.SeedListener>)
sourceTagSeeds
(boolean) Whether to tag seeds with their own URI as a heritable ‘source’ String, which will be carried forward to all URIs discovered on paths originating from that seed. When present, such source tags appear in the second-to-last crawl.log field.
textSource
(org.archive.io.ReadSource) Text from which to extract seeds

Decide Rules

AcceptDecideRule

<bean id="acceptDecideRule" class="org.archive.modules.deciderules.AcceptDecideRule">
</bean>
import org.archive.modules.deciderules.AcceptDecideRule

acceptDecideRule(AcceptDecideRule) {
}

ClassKeyMatchesRegexDecideRule

Rule applies configured decision to any CrawlURI whose class key – i.e. CrawlURI#getClassKey() – matches the supplied regex.

<bean id="classKeyMatchesRegexDecideRule" class="org.archive.crawler.deciderules.ClassKeyMatchesRegexDecideRule">
  <!-- <property name="crawlController" value="" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.crawler.deciderules.ClassKeyMatchesRegexDecideRule

classKeyMatchesRegexDecideRule(ClassKeyMatchesRegexDecideRule) {
    // crawlController = ''
    // decision = ''
    // regex = ''
}
crawlController
(org.archive.crawler.framework.CrawlController)
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

ContentLengthDecideRule

<bean id="contentLengthDecideRule" class="org.archive.modules.deciderules.ContentLengthDecideRule">
  <!-- <property name="contentLengthThreshold" value="" /> -->
</bean>
import org.archive.modules.deciderules.ContentLengthDecideRule

contentLengthDecideRule(ContentLengthDecideRule) {
    // contentLengthThreshold = 0
}
contentLengthThreshold
(long) Content-length threshold.  The rule returns ACCEPT if the content-length is less than this threshold, or REJECT otherwise.  The default is 2^63, meaning any document will be accepted.

ContentTypeMatchesRegexDecideRule

DecideRule whose decision is applied if the URI’s content-type is present and matches the supplied regular expression.

<bean id="contentTypeMatchesRegexDecideRule" class="org.archive.modules.deciderules.ContentTypeMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.ContentTypeMatchesRegexDecideRule

contentTypeMatchesRegexDecideRule(ContentTypeMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

ContentTypeNotMatchesRegexDecideRule

DecideRule whose decision is applied if the URI’s content-type is present and does not match the supplied regular expression.

<bean id="contentTypeNotMatchesRegexDecideRule" class="org.archive.modules.deciderules.ContentTypeNotMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.ContentTypeNotMatchesRegexDecideRule

contentTypeNotMatchesRegexDecideRule(ContentTypeNotMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

ExpressionDecideRule (contrib)

Example usage:

 <bean class="org.archive.modules.deciderules.ExpressionDecideRule">    <property name="groovyExpression" value='curi.via == null &amp;&amp; curi ==~ "^https?://(?:www\\.)?(facebook|vimeo|flickr)\\.com/.*"'/></bean>

<bean id="expressionDecideRule" class="org.archive.modules.deciderules.ExpressionDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="groovyExpression" value="" /> -->
</bean>
import org.archive.modules.deciderules.ExpressionDecideRule

expressionDecideRule(ExpressionDecideRule) {
    // decision = ''
    // groovyExpression = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
groovyExpression
(java.lang.String)

ExternalGeoLocationDecideRule

A rule that can be configured to take alternate implementations of the ExternalGeoLookupInterface. If no implementation specified, or none found, returns configured decision. If host in URI has been resolved checks CrawlHost for the country code determination. If country code is not present, does country lookup, and saves the country code to CrawlHost for future consultation. If country code is present in CrawlHost, compares it against the configured code. Note that if a host’s IP address changes during the crawl, we still consider the associated hostname to be in the country of its original IP address.

<bean id="externalGeoLocationDecideRule" class="org.archive.modules.deciderules.ExternalGeoLocationDecideRule">
  <!-- <property name="countryCodes" value="" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="lookup" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.modules.deciderules.ExternalGeoLocationDecideRule

externalGeoLocationDecideRule(ExternalGeoLocationDecideRule) {
    // countryCodes = ''
    // decision = ''
    // lookup = ''
    // serverCache = ''
}
countryCodes
(java.util.List<java.lang.String>) Country code name.
decision
(org.archive.modules.deciderules.DecideResult)
lookup
(org.archive.modules.deciderules.ExternalGeoLookupInterface)
serverCache
(org.archive.modules.net.ServerCache)

FetchStatusDecideRule

Rule applies the configured decision for any URI which has a fetch status equal to one of the configured ‘statusCodes’.

<bean id="fetchStatusDecideRule" class="org.archive.modules.deciderules.FetchStatusDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="statusCodes" value="" /> -->
</bean>
import org.archive.modules.deciderules.FetchStatusDecideRule

fetchStatusDecideRule(FetchStatusDecideRule) {
    // decision = ''
    // statusCodes = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
statusCodes
(java.util.List<java.lang.Integer>)

FetchStatusMatchesRegexDecideRule

<bean id="fetchStatusMatchesRegexDecideRule" class="org.archive.modules.deciderules.FetchStatusMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.FetchStatusMatchesRegexDecideRule

fetchStatusMatchesRegexDecideRule(FetchStatusMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

FetchStatusNotMatchesRegexDecideRule

<bean id="fetchStatusNotMatchesRegexDecideRule" class="org.archive.modules.deciderules.FetchStatusNotMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.FetchStatusNotMatchesRegexDecideRule

fetchStatusNotMatchesRegexDecideRule(FetchStatusNotMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

HasViaDecideRule

Rule applies the configured decision for any URI which has a ‘via’ (essentially, any URI that was not a seed or some kinds of mid-crawl adds).

<bean id="hasViaDecideRule" class="org.archive.modules.deciderules.HasViaDecideRule">
  <!-- <property name="decision" value="" /> -->
</bean>
import org.archive.modules.deciderules.HasViaDecideRule

hasViaDecideRule(HasViaDecideRule) {
    // decision = ''
}
decision
(org.archive.modules.deciderules.DecideResult)

HopCrossesAssignmentLevelDomainDecideRule

Applies its decision if the current URI differs in that portion of its hostname/domain that is assigned/sold by registrars, its ‘assignment-level-domain’ (ALD) (AKA ‘public suffix’ or in previous Heritrix versions, ‘topmost assigned SURT’)

<bean id="hopCrossesAssignmentLevelDomainDecideRule" class="org.archive.modules.deciderules.HopCrossesAssignmentLevelDomainDecideRule">
  <!-- <property name="decision" value="" /> -->
</bean>
import org.archive.modules.deciderules.HopCrossesAssignmentLevelDomainDecideRule

hopCrossesAssignmentLevelDomainDecideRule(HopCrossesAssignmentLevelDomainDecideRule) {
    // decision = ''
}
decision
(org.archive.modules.deciderules.DecideResult)

HopsPathMatchesRegexDecideRule

Rule applies configured decision to any CrawlURIs whose ‘hops-path’ (string like “LLXE” etc.) matches the supplied regex.

<bean id="hopsPathMatchesRegexDecideRule" class="org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule

hopsPathMatchesRegexDecideRule(HopsPathMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

IdenticalDigestDecideRule

Rule applies configured decision to any CrawlURIs whose revisit profile is set with a profile matching WARCConstants#PROFILE_REVISIT_IDENTICAL_DIGEST

<bean id="identicalDigestDecideRule" class="org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule">
  <!-- <property name="decision" value="" /> -->
</bean>
import org.archive.modules.deciderules.recrawl.IdenticalDigestDecideRule

identicalDigestDecideRule(IdenticalDigestDecideRule) {
    // decision = ''
}
decision
(org.archive.modules.deciderules.DecideResult)

IpAddressSetDecideRule

IpAddressSetDecideRule must be used with org.archive.crawler.prefetch.Preselector#setRecheckScope(boolean) set to true because it relies on Heritrix’ dns lookup to establish the ip address for a URI before it can run.

<bean class="org.archive.modules.deciderules.IpAddressSetDecideRule"> <property name="ipAddresses">  <set>   <value>127.0.0.1</value>   <value>69.89.27.209</value>  </set> </property> <property name='decision' value='REJECT' /></bean>

<bean id="ipAddressSetDecideRule" class="org.archive.modules.deciderules.IpAddressSetDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="ipAddresses" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.modules.deciderules.IpAddressSetDecideRule

ipAddressSetDecideRule(IpAddressSetDecideRule) {
    // decision = ''
    // ipAddresses = ''
    // serverCache = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
ipAddresses
(java.util.Set<java.lang.String>) The IP addresses to match.
serverCache
(org.archive.modules.net.ServerCache)

MatchesFilePatternDecideRule

Compares suffix of a passed CrawlURI, UURI, or String against a regular expression pattern, applying its configured decision to all matches.

Several predefined patterns are available for convenience. Choosing ‘custom’ makes this the same as a regular MatchesRegexDecideRule.

<bean id="matchesFilePatternDecideRule" class="org.archive.modules.deciderules.MatchesFilePatternDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
  <!-- <property name="usePreset" value="" /> -->
</bean>
import org.archive.modules.deciderules.MatchesFilePatternDecideRule

matchesFilePatternDecideRule(MatchesFilePatternDecideRule) {
    // decision = ''
    // regex = ''
    // usePreset = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)
usePreset
(org.archive.modules.deciderules.MatchesFilePatternDecideRule.Preset)

MatchesListRegexDecideRule

Rule applies configured decision to any CrawlURIs whose String URI matches the supplied regexes.

The list of regular expressions can be considered logically AND or OR.

<bean id="matchesListRegexDecideRule" class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="listLogicalOr" value="true" /> -->
  <!-- <property name="regexList" value="" /> -->
  <!-- <property name="timeoutPerRegexSeconds" value="0" /> -->
</bean>
import org.archive.modules.deciderules.MatchesListRegexDecideRule

matchesListRegexDecideRule(MatchesListRegexDecideRule) {
    // decision = ''
    // listLogicalOr = true
    // regexList = ''
    // timeoutPerRegexSeconds = 0
}
decision
(org.archive.modules.deciderules.DecideResult)
listLogicalOr
(boolean) True if the list of regular expressions should be considered as logically OR when matching. False if the list of regular expressions should be considered as logically AND when matching.
regexList
(java.util.List<java.util.regex.Pattern>) The list of regular expressions to evaluate against the URI.
timeoutPerRegexSeconds
(long) The timeout for regular expression matching, in seconds. If set to 0 or negative then no timeout is specified and there is no upper limit to how long the matching may take. See the corresponding test class MatchesListRegexDecideRuleTest for a pathological example.

MatchesRegexDecideRule

Rule applies configured decision to any CrawlURIs whose String URI matches the supplied regex.

<bean id="matchesRegexDecideRule" class="org.archive.modules.deciderules.MatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.MatchesRegexDecideRule

matchesRegexDecideRule(MatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

MatchesStatusCodeDecideRule

Provides a rule that returns “true” for any CrawlURIs which have a fetch status code that falls within the provided inclusive range. For instance, to select only URIs with a “success” status code you must provide the range 200 to 299.

<bean id="matchesStatusCodeDecideRule" class="org.archive.modules.deciderules.MatchesStatusCodeDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="lowerBound" value="" /> -->
  <!-- <property name="upperBound" value="" /> -->
</bean>
import org.archive.modules.deciderules.MatchesStatusCodeDecideRule

matchesStatusCodeDecideRule(MatchesStatusCodeDecideRule) {
    // decision = ''
    // lowerBound = 0
    // upperBound = 0
}
decision
(org.archive.modules.deciderules.DecideResult)
lowerBound
(java.lang.Integer) Sets the lower bound on the range of acceptable status codes.

@param statusCode Status code

upperBound
(java.lang.Integer) Sets the upper bound on the range of acceptable status codes.

@param statusCode Status code

NotMatchesFilePatternDecideRule

Rule applies configured decision to any URIs which do *not* match the supplied (file-pattern) regex.

<bean id="notMatchesFilePatternDecideRule" class="org.archive.modules.deciderules.NotMatchesFilePatternDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
  <!-- <property name="usePreset" value="" /> -->
</bean>
import org.archive.modules.deciderules.NotMatchesFilePatternDecideRule

notMatchesFilePatternDecideRule(NotMatchesFilePatternDecideRule) {
    // decision = ''
    // regex = ''
    // usePreset = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)
usePreset
(org.archive.modules.deciderules.MatchesFilePatternDecideRule.Preset)

NotMatchesListRegexDecideRule

Rule applies configured decision to any URIs which do *not* match the supplied regexes.

<bean id="notMatchesListRegexDecideRule" class="org.archive.modules.deciderules.NotMatchesListRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="listLogicalOr" value="true" /> -->
  <!-- <property name="regexList" value="" /> -->
  <!-- <property name="timeoutPerRegexSeconds" value="0" /> -->
</bean>
import org.archive.modules.deciderules.NotMatchesListRegexDecideRule

notMatchesListRegexDecideRule(NotMatchesListRegexDecideRule) {
    // decision = ''
    // listLogicalOr = true
    // regexList = ''
    // timeoutPerRegexSeconds = 0
}
decision
(org.archive.modules.deciderules.DecideResult)
listLogicalOr
(boolean) True if the list of regular expressions should be considered as logically OR when matching. False if the list of regular expressions should be considered as logically AND when matching.
regexList
(java.util.List<java.util.regex.Pattern>) The list of regular expressions to evaluate against the URI.
timeoutPerRegexSeconds
(long) The timeout for regular expression matching, in seconds. If set to 0 or negative then no timeout is specified and there is no upper limit to how long the matching may take. See the corresponding test class MatchesListRegexDecideRuleTest for a pathological example.

NotMatchesRegexDecideRule

Rule applies configured decision to any URIs which do *not* match the supplied regex.

<bean id="notMatchesRegexDecideRule" class="org.archive.modules.deciderules.NotMatchesRegexDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="regex" value="" /> -->
</bean>
import org.archive.modules.deciderules.NotMatchesRegexDecideRule

notMatchesRegexDecideRule(NotMatchesRegexDecideRule) {
    // decision = ''
    // regex = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
regex
(java.util.regex.Pattern)

NotMatchesStatusCodeDecideRule

Provides a rule that returns “true” for any CrawlURIs which have a fetch status code that does not fall within the provided inclusive range. For instance, to reject any URIs with a “client error” status code you must provide the range 400 to 499.

<bean id="notMatchesStatusCodeDecideRule" class="org.archive.modules.deciderules.NotMatchesStatusCodeDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="lowerBound" value="" /> -->
  <!-- <property name="upperBound" value="" /> -->
  <!-- <property name="upperBound" value="" /> -->
</bean>
import org.archive.modules.deciderules.NotMatchesStatusCodeDecideRule

notMatchesStatusCodeDecideRule(NotMatchesStatusCodeDecideRule) {
    // decision = ''
    // lowerBound = 0
    // upperBound = 0
    // upperBound = 0
}
decision
(org.archive.modules.deciderules.DecideResult)
lowerBound
(java.lang.Integer) Sets the lower bound on the range of acceptable status codes.

@param statusCode Status code

upperBound
(java.lang.Integer) Sets the upper bound on the range of acceptable status codes.

@param statusCode Status code

NotOnDomainsDecideRule

Rule applies configured decision to any URIs that are *not* in one of the domains in the configured set of domains, filled from the seed set.

<bean id="notOnDomainsDecideRule" class="org.archive.modules.deciderules.surt.NotOnDomainsDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.NotOnDomainsDecideRule

notOnDomainsDecideRule(NotOnDomainsDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

NotOnHostsDecideRule

Rule applies configured decision to any URIs that are *not* on one of the hosts in the configured set of hosts, filled from the seed set.

<bean id="notOnHostsDecideRule" class="org.archive.modules.deciderules.surt.NotOnHostsDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.NotOnHostsDecideRule

notOnHostsDecideRule(NotOnHostsDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

NotSurtPrefixedDecideRule

Rule applies configured decision to any URIs that, when expressed in SURT form, do *not* begin with one of the prefixes in the configured set.

The set can be filled with SURT prefixes implied or listed in the seeds file, or another external file.

<bean id="notSurtPrefixedDecideRule" class="org.archive.modules.deciderules.surt.NotSurtPrefixedDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.NotSurtPrefixedDecideRule

notSurtPrefixedDecideRule(NotSurtPrefixedDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

OnDomainsDecideRule

Rule applies configured decision to any URIs that are on one of the domains in the configured set of domains, filled from the seed set.

<bean id="onDomainsDecideRule" class="org.archive.modules.deciderules.surt.OnDomainsDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.OnDomainsDecideRule

onDomainsDecideRule(OnDomainsDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

OnHostsDecideRule

Rule applies configured decision to any URIs that are on one of the hosts in the configured set of hosts, filled from the seed set.

<bean id="onHostsDecideRule" class="org.archive.modules.deciderules.surt.OnHostsDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.OnHostsDecideRule

onHostsDecideRule(OnHostsDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

PathologicalPathDecideRule

Rule REJECTs any URI which contains an excessive number of identical, consecutive path-segments (eg http://example.com/a/a/a/boo.html == 3 ‘/a’ segments)

<bean id="pathologicalPathDecideRule" class="org.archive.modules.deciderules.PathologicalPathDecideRule">
  <!-- <property name="maxRepetitions" value="2" /> -->
</bean>
import org.archive.modules.deciderules.PathologicalPathDecideRule

pathologicalPathDecideRule(PathologicalPathDecideRule) {
    // maxRepetitions = 2
}
maxRepetitions
(int) Number of times the pattern should be allowed to occur. This rule returns its decision (usually REJECT) if a path-segment is repeated more than this number of times.

PredicatedDecideRule

Rule which applies the configured decision only if a test evaluates to true. Subclasses override evaluate() to establish the test.

<bean id="predicatedDecideRule" class="org.archive.modules.deciderules.PredicatedDecideRule">
  <!-- <property name="decision" value="" /> -->
</bean>
import org.archive.modules.deciderules.PredicatedDecideRule

predicatedDecideRule(PredicatedDecideRule) {
    // decision = ''
}
decision
(org.archive.modules.deciderules.DecideResult)

PrerequisiteAcceptDecideRule

Rule which ACCEPTs all ‘prerequisite’ URIs (those with a ‘P’ in the last hopsPath position). Good in a late position to ensure other scope settings don’t lock out necessary prerequisites.

<bean id="prerequisiteAcceptDecideRule" class="org.archive.modules.deciderules.PrerequisiteAcceptDecideRule">
</bean>
import org.archive.modules.deciderules.PrerequisiteAcceptDecideRule

prerequisiteAcceptDecideRule(PrerequisiteAcceptDecideRule) {
}

RejectDecideRule

<bean id="rejectDecideRule" class="org.archive.modules.deciderules.RejectDecideRule">
</bean>
import org.archive.modules.deciderules.RejectDecideRule

rejectDecideRule(RejectDecideRule) {
}

ResourceLongerThanDecideRule

Applies configured decision for URIs with content length greater than a given threshold length value. Examines either HTTP header Content-Length or actual downloaded content length (based on the useHeaderLength property), and has no effect on resources shorter than or equal to the given threshold value.

Note that because neither the Content-Length header nor the actual size are available at URI-scoping time, this rule is unusable in crawl scopes. Instead, the earliest it can be used is as a mid-fetch rule (in FetchHTTP), when the headers are available but not yet the body. It can also be used to affect processing after the URI is fully fetched.

<bean id="resourceLongerThanDecideRule" class="org.archive.modules.deciderules.ResourceLongerThanDecideRule">
  <!-- <property name="contentLengthThreshold" value="-1" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="useHeaderLength" value="true" /> -->
</bean>
import org.archive.modules.deciderules.ResourceLongerThanDecideRule

resourceLongerThanDecideRule(ResourceLongerThanDecideRule) {
    // contentLengthThreshold = -1
    // decision = ''
    // useHeaderLength = true
}
contentLengthThreshold
(long) Max content-length this filter will allow to pass through. If -1, then no limit.
decision
(org.archive.modules.deciderules.DecideResult)
useHeaderLength
(boolean) Shall this rule be used as a midfetch rule? If true, this rule will determine content length based on HTTP header information, otherwise the size of the already downloaded content will be used.

ResourceNoLongerThanDecideRule

Applies configured decision for URIs with content length less than or equal to a given threshold length value. Examines either HTTP header Content-Length or actual downloaded content length (based on the useHeaderLength property), and has no effect on resources longer than the given threshold value.

Note that because neither the Content-Length header nor the actual size are available at URI-scoping time, this rule is unusable in crawl scopes. Instead, the earliest it can be used is as a mid-fetch rule (in FetchHTTP), when the headers are available but not yet the body. It can also be used to affect processing after the URI is fully fetched.

<bean id="resourceNoLongerThanDecideRule" class="org.archive.modules.deciderules.ResourceNoLongerThanDecideRule">
  <!-- <property name="contentLengthThreshold" value="-1" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="useHeaderLength" value="true" /> -->
</bean>
import org.archive.modules.deciderules.ResourceNoLongerThanDecideRule

resourceNoLongerThanDecideRule(ResourceNoLongerThanDecideRule) {
    // contentLengthThreshold = -1
    // decision = ''
    // useHeaderLength = true
}
contentLengthThreshold
(long) Max content-length this filter will allow to pass through. If -1, then no limit.
decision
(org.archive.modules.deciderules.DecideResult)
useHeaderLength
(boolean) Shall this rule be used as a midfetch rule? If true, this rule will determine content length based on HTTP header information, otherwise the size of the already downloaded content will be used.

ResponseContentLengthDecideRule

Decide rule that will ACCEPT or REJECT a uri, depending on the “decision” property, after it’s fetched, if the content body is within a specified size range, specified in bytes.

<bean id="responseContentLengthDecideRule" class="org.archive.modules.deciderules.ResponseContentLengthDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="lowerBound" value="0" /> -->
  <!-- <property name="upperBound" value="" /> -->
</bean>
import org.archive.modules.deciderules.ResponseContentLengthDecideRule

responseContentLengthDecideRule(ResponseContentLengthDecideRule) {
    // decision = ''
    // lowerBound = 0
    // upperBound = 0
}
decision
(org.archive.modules.deciderules.DecideResult)
lowerBound
(long) The rule will apply if the url has been fetched and content body length is greater than or equal to this number of bytes. Default is 0, meaning everything will match.
upperBound
(long) The rule will apply if the url has been fetched and content body length is less than or equal to this number of bytes. Default is Long.MAX_VALUE, meaning everything will match.

SchemeNotInSetDecideRule

Rule applies the configured decision (default REJECT) for any URI which has a URI-scheme NOT contained in the configured Set.

<bean id="schemeNotInSetDecideRule" class="org.archive.modules.deciderules.SchemeNotInSetDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="schemes" value="" /> -->
</bean>
import org.archive.modules.deciderules.SchemeNotInSetDecideRule

schemeNotInSetDecideRule(SchemeNotInSetDecideRule) {
    // decision = ''
    // schemes = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
schemes
(java.util.Set<java.lang.String>) set of schemes to test URI scheme

ScriptedDecideRule

Rule which runs a JSR-223 script to make its decision.

Script source may be provided via a file local to the crawler or an inline configuration string.

The source must include a one-argument function “decisionFor” which returns the appropriate DecideResult.

Variables available to the script include ‘object’ (the object to be evaluated, typically a CrawlURI), ‘self’ (this ScriptedDecideRule instance), and ‘context’ (the crawl’s ApplicationContext, from which all named crawl beans are easily reachable).

by watching for a certain applicationEvent?

<bean id="scriptedDecideRule" class="org.archive.modules.deciderules.ScriptedDecideRule">
  <!-- <property name="applicationContext" value="" /> -->
  <!-- <property name="engineName" value="beanshell" /> -->
  <!-- <property name="isolateThreads" value="true" /> -->
  <!-- <property name="scriptSource" value="" /> -->
</bean>
import org.archive.modules.deciderules.ScriptedDecideRule

scriptedDecideRule(ScriptedDecideRule) {
    // applicationContext = ''
    // engineName = 'beanshell'
    // isolateThreads = true
    // scriptSource = ''
}
applicationContext
(org.springframework.context.ApplicationContext)
engineName
(java.lang.String) engine name; default “beanshell”
isolateThreads
(boolean) Whether each ToeThread should get its own independent script engine, or they should share synchronized access to one engine. Default is true, meaning each thread gets its own isolated engine.
scriptSource
(org.archive.io.ReadSource)

SeedAcceptDecideRule

Rule which ACCEPTs all ‘seed’ URIs (those for which isSeed is true). Good in a late position to ensure other scope settings don’t lock out explicitly added seeds.

<bean id="seedAcceptDecideRule" class="org.archive.modules.deciderules.SeedAcceptDecideRule">
</bean>
import org.archive.modules.deciderules.SeedAcceptDecideRule

seedAcceptDecideRule(SeedAcceptDecideRule) {
}

SourceSeedDecideRule

Rule applies the configured decision for any URI discovered from one of the seeds in sourceSeeds.

SeedModule#getSourceTagSeeds() must be enabled or the rule will never apply.

<bean id="sourceSeedDecideRule" class="org.archive.modules.deciderules.SourceSeedDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="sourceSeeds" value="" /> -->
</bean>
import org.archive.modules.deciderules.SourceSeedDecideRule

sourceSeedDecideRule(SourceSeedDecideRule) {
    // decision = ''
    // sourceSeeds = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
sourceSeeds
(java.util.Set<java.lang.String>)

SurtPrefixedDecideRule

Rule applies configured decision to any URIs that, when expressed in SURT form, begin with one of the prefixes in the configured set.

The set can be filled with SURT prefixes implied or listed in the seeds file, or another external file.

The “also-check-via” option to implement “one hop off” scoping derives from a contribution by Shifra Raffel of the California Digital Library.

<bean id="surtPrefixedDecideRule" class="org.archive.modules.deciderules.surt.SurtPrefixedDecideRule">
  <!-- <property name="alsoCheckVia" value="false" /> -->
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="recoveryCheckpoint" value="" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsAsSurtPrefixes" value="true" /> -->
  <!-- <property name="surtsDumpFile" value="" /> -->
  <!-- <property name="surtsSource" value="" /> -->
  <!-- <property name="surtsSourceFile" value="" /> -->
</bean>
import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule

surtPrefixedDecideRule(SurtPrefixedDecideRule) {
    // alsoCheckVia = false
    // decision = ''
    // recoveryCheckpoint = ''
    // seeds = ''
    // seedsAsSurtPrefixes = true
    // surtsDumpFile = ''
    // surtsSource = ''
    // surtsSourceFile = ''
}
alsoCheckVia
(boolean) Whether to also make the configured decision if a URI’s ‘via’ URI (the URI from which it was discovered) in SURT form begins with any of the established prefixes. For example, can be used to ACCEPT URIs that are ‘one hop off’ URIs fitting the SURT prefixes. Default is false.
decision
(org.archive.modules.deciderules.DecideResult)
recoveryCheckpoint
(org.archive.checkpointing.Checkpoint)
seeds
(org.archive.modules.seeds.SeedModule)
seedsAsSurtPrefixes
(boolean) Should seeds also be interpreted as SURT prefixes.
surtsDumpFile
(org.archive.spring.ConfigFile) Dump file to save SURT prefixes actually used: useful for debugging SURTs.
surtsSource
(org.archive.io.ReadSource) Text from which to infer SURT prefixes. Any URLs will be converted to the implied SURT prefix, and literal SURT prefixes may be listed on lines beginning with a ‘+’ character.
surtsSourceFile
(org.archive.spring.ConfigFile) @deprecated

TooManyHopsDecideRule

Rule REJECTs any CrawlURIs whose total number of hops (length of the hopsPath string, traversed links of any type) is over a threshold. Otherwise returns PASS.

<bean id="tooManyHopsDecideRule" class="org.archive.modules.deciderules.TooManyHopsDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="maxHops" value="20" /> -->
</bean>
import org.archive.modules.deciderules.TooManyHopsDecideRule

tooManyHopsDecideRule(TooManyHopsDecideRule) {
    // decision = ''
    // maxHops = 20
}
decision
(org.archive.modules.deciderules.DecideResult)
maxHops
(int) Max path depth for which this filter will match.

TooManyPathSegmentsDecideRule

Rule REJECTs any CrawlURIs whose total number of path-segments (as indicated by the count of ‘/’ characters not including the first ‘//’) is over a given threshold.

<bean id="tooManyPathSegmentsDecideRule" class="org.archive.modules.deciderules.TooManyPathSegmentsDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="maxPathDepth" value="20" /> -->
</bean>
import org.archive.modules.deciderules.TooManyPathSegmentsDecideRule

tooManyPathSegmentsDecideRule(TooManyPathSegmentsDecideRule) {
    // decision = ''
    // maxPathDepth = 20
}
decision
(org.archive.modules.deciderules.DecideResult)
maxPathDepth
(int) Number of path segments beyond which this rule will reject URIs.

TransclusionDecideRule

Rule ACCEPTs any CrawlURIs whose path-from-seed (‘hopsPath’ – see CrawlURI#getPathFromSeed()) ends with at least one, but not more than the given number of, non-navlink (non-‘L’) hops.

Otherwise, if the path-from-seed is empty or if a navlink (‘L’) occurs within max-trans-hops of the tail of the path-from-seed, this rule returns PASS.

Thus, it allows things like embedded resources (frames/images/media) and redirects to be transitively included (‘transcluded’) in a crawl, even if they otherwise would not, for some reasonable number of hops (usually 1-5).

<bean id="transclusionDecideRule" class="org.archive.modules.deciderules.TransclusionDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="maxSpeculativeHops" value="1" /> -->
  <!-- <property name="maxTransHops" value="2" /> -->
</bean>
import org.archive.modules.deciderules.TransclusionDecideRule

transclusionDecideRule(TransclusionDecideRule) {
    // decision = ''
    // maxSpeculativeHops = 1
    // maxTransHops = 2
}
decision
(org.archive.modules.deciderules.DecideResult)
maxSpeculativeHops
(int) Maximum number of speculative (‘X’) hops to ACCEPT.
maxTransHops
(int) Maximum number of non-navlink (non-‘L’) hops to ACCEPT.

ViaSurtPrefixedDecideRule

Rule applies the configured decision for any URI which has a ‘via’ whose SURT form matches any SURT specified in the surtPrefixes list.

<bean id="viaSurtPrefixedDecideRule" class="org.archive.modules.deciderules.ViaSurtPrefixedDecideRule">
  <!-- <property name="decision" value="" /> -->
  <!-- <property name="surtPrefixes" value="" /> -->
</bean>
import org.archive.modules.deciderules.ViaSurtPrefixedDecideRule

viaSurtPrefixedDecideRule(ViaSurtPrefixedDecideRule) {
    // decision = ''
    // surtPrefixes = ''
}
decision
(org.archive.modules.deciderules.DecideResult)
surtPrefixes
(java.util.List<java.lang.String>)

Candidate Processors

CandidateScoper

Simple single-URI scoper, considers passed-in URI as candidate; sets fetch status negative and skips to end of processing if out-of-scope.

<bean id="candidateScoper" class="org.archive.crawler.prefetch.CandidateScoper">
  <!-- <property name="logToFile" value="false" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="scope" value="" /> -->
</bean>
import org.archive.crawler.prefetch.CandidateScoper

candidateScoper(CandidateScoper) {
    // logToFile = false
    // loggerModule = ''
    // scope = ''
}
logToFile
(boolean) If enabled, log decisions to file named logs/{spring-bean-id}.log. Format is “[timestamp] [decision] [uri]” where decision is ‘ACCEPT’ or ‘REJECT’.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
scope
(org.archive.modules.deciderules.DecideRule)

FrontierPreparer

Processor to preload URI with as much precalculated policy-based info as possible before it reaches frontier critical sections.

Frontiers also maintain a direct reference to this class, in case they need to perform remedial preparation for URIs that do not pass through this processor on the CandidateChain.

<bean id="frontierPreparer" class="org.archive.crawler.prefetch.FrontierPreparer">
  <!-- <property name="canonicalizationPolicy" value="" /> -->
  <!-- <property name="costAssignmentPolicy" value="" /> -->
  <!-- <property name="logToFile" value="false" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="preferenceDepthHops" value="-1" /> -->
  <!-- <property name="preferenceEmbedHops" value="1" /> -->
  <!-- <property name="queueAssignmentPolicy" value="" /> -->
  <!-- <property name="scope" value="" /> -->
  <!-- <property name="uriPrecedencePolicy" value="" /> -->
</bean>
import org.archive.crawler.prefetch.FrontierPreparer

frontierPreparer(FrontierPreparer) {
    // canonicalizationPolicy = ''
    // costAssignmentPolicy = ''
    // logToFile = false
    // loggerModule = ''
    // preferenceDepthHops = -1
    // preferenceEmbedHops = 1
    // queueAssignmentPolicy = ''
    // scope = ''
    // uriPrecedencePolicy = ''
}
canonicalizationPolicy
(org.archive.modules.canonicalize.UriCanonicalizationPolicy) Ordered list of url canonicalization rules. Rules are applied in the order listed from top to bottom.
costAssignmentPolicy
(org.archive.crawler.frontier.CostAssignmentPolicy) cost assignment policy to use.
logToFile
(boolean) If enabled, log decisions to file named logs/{spring-bean-id}.log. Format is “[timestamp] [decision] [uri]” where decision is ‘ACCEPT’ or ‘REJECT’.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
preferenceDepthHops
(int) Number of hops (of any sort) from a seed up to which a URI has higher priority scheduling than any remaining seed. For example, if set to 1, items one hop (link, embed, redirect, etc.) away from a seed will be scheduled with HIGH priority. If set to -1, no preferencing will occur, and a breadth-first search with seeds processed before discovered links will proceed. If set to zero, a purely depth-first search will proceed, with all discovered links processed before remaining seeds. Seed redirects are treated as one hop from a seed.
preferenceEmbedHops
(int) number of hops of embeds (ERX) to bump to front of host queue
queueAssignmentPolicy
(org.archive.crawler.frontier.QueueAssignmentPolicy) Defines how to assign URIs to queues. Can assign by host, by ip, by SURT-ordered authority, by SURT-ordered authority truncated to a topmost-assignable domain, and into one of a fixed set of buckets (1k).
scope
(org.archive.modules.deciderules.DecideRule)
uriPrecedencePolicy
(org.archive.crawler.frontier.precedence.UriPrecedencePolicy) URI precedence assignment policy to use.

Pre-Fetch Processors

PreconditionEnforcer

Ensures the preconditions for a fetch – such as DNS lookup or acquiring and respecting a robots.txt policy – are satisfied before a URI is passed to subsequent stages.

<bean id="preconditionEnforcer" class="org.archive.crawler.prefetch.PreconditionEnforcer">
  <!-- <property name="calculateRobotsOnly" value="false" /> -->
  <!-- <property name="credentialStore" value="" /> -->
  <!-- <property name="ipValidityDurationSeconds" value="" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="metadata" value="" /> -->
  <!-- <property name="robotsValidityDurationSeconds" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.crawler.prefetch.PreconditionEnforcer

preconditionEnforcer(PreconditionEnforcer) {
    // calculateRobotsOnly = false
    // credentialStore = ''
    // ipValidityDurationSeconds = 0
    // loggerModule = ''
    // metadata = ''
    // robotsValidityDurationSeconds = 0
    // serverCache = ''
}
calculateRobotsOnly
(boolean) Whether to only calculate the robots status of an URI, without actually applying any exclusions found. If true, excluded URIs will only be annotated in the crawl.log, but still fetched. Default is false.
credentialStore
(org.archive.modules.credential.CredentialStore)
ipValidityDurationSeconds
(int) The minimum interval for which a dns-record will be considered valid (in seconds). If the record’s DNS TTL is larger, that will be used instead.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
metadata
(org.archive.modules.CrawlMetadata) Auto-discovered module providing configured (or overridden) User-Agent value and RobotsHonoringPolicy.
robotsValidityDurationSeconds
(int) The time in seconds that fetched robots.txt information is considered to be valid. If the value is set to ‘0’, then the robots.txt information will never expire.
serverCache
(org.archive.modules.net.ServerCache)

Preselector

If set to recheck the crawl’s scope, gives a yes/no on whether a CrawlURI should be processed at all. If not, its status will be marked OUT_OF_SCOPE and the URI will skip directly to the first “postprocessor”.

<bean id="preselector" class="org.archive.crawler.prefetch.Preselector">
  <!-- <property name="allowByRegex" value="" /> -->
  <!-- <property name="blockAll" value="false" /> -->
  <!-- <property name="blockByRegex" value="" /> -->
  <!-- <property name="logToFile" value="false" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="recheckScope" value="false" /> -->
  <!-- <property name="scope" value="" /> -->
</bean>
import org.archive.crawler.prefetch.Preselector

preselector(Preselector) {
    // allowByRegex = ''
    // blockAll = false
    // blockByRegex = ''
    // logToFile = false
    // loggerModule = ''
    // recheckScope = false
    // scope = ''
}
allowByRegex
(java.lang.String) Allow only URIs matching the regular expression to be processed.
blockAll
(boolean) Block all URIs from being processed. This is most likely to be used in overrides to easily reject certain hosts from being processed.
blockByRegex
(java.lang.String) Block all URIs matching the regular expression from being processed.
logToFile
(boolean) If enabled, log decisions to file named logs/{spring-bean-id}.log. Format is “[timestamp] [decision] [uri]” where decision is ‘ACCEPT’ or ‘REJECT’.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
recheckScope
(boolean) Recheck if uri is in scope. This is meaningful if the scope is altered during a crawl. URIs are checked against the scope when they are added to queues. Setting this value to true forces the URI to be checked against the scope when it is coming out of the queue, possibly after the scope is altered.
scope
(org.archive.modules.deciderules.DecideRule)

Fetch Processors

FetchDNS

Processor to resolve ‘dns:’ URIs.

<bean id="fetchDNS" class="org.archive.modules.fetcher.FetchDNS">
  <!-- <property name="acceptNonDnsResolves" value="false" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="disableJavaDnsResolves" value="false" /> -->
  <!-- <property name="dnsOverHttpServer" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.modules.fetcher.FetchDNS

fetchDNS(FetchDNS) {
    // acceptNonDnsResolves = false
    // digestAlgorithm = 'sha1'
    // digestContent = true
    // disableJavaDnsResolves = false
    // dnsOverHttpServer = ''
    // serverCache = ''
}
acceptNonDnsResolves
(boolean) If a DNS lookup fails, whether or not to fall back to InetAddress resolution, which may use local ‘hosts’ files or other mechanisms.
digestAlgorithm
(java.lang.String) Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest hash of retrieved content-bodies.
digestContent
(boolean) Whether or not to perform an on-the-fly digest hash of retrieved content-bodies.
disableJavaDnsResolves
(boolean) Optionally, only allow InetAddress resolution, precisely because it may use local ‘hosts’ files or other mechanisms.

This should not generally be used in production as it will prevent DNS lookups from being recorded properly.

dnsOverHttpServer
(java.lang.String) URL to the DNS-over-HTTP(S) server. If this is not set or set to an empty string, no DNS-over-HTTP(S) will be used; otherwise it should contain the URL to the DNS-over-HTTPS server.
serverCache
(org.archive.modules.net.ServerCache) Used to do DNS lookups.

FetchFTP

Fetches documents and directory listings using FTP. This class will also try to extract FTP “links” from directory listings. For this class to archive a directory listing, the remote FTP server must support the NLIST command. Most modern FTP servers should.

<bean id="fetchFTP" class="org.archive.modules.fetcher.FetchFTP">
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="extractFromDirs" value="true" /> -->
  <!-- <property name="extractParent" value="true" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="password" value="password" /> -->
  <!-- <property name="soTimeoutMs" value="" /> -->
  <!-- <property name="timeoutSeconds" value="" /> -->
  <!-- <property name="username" value="anonymous" /> -->
</bean>
import org.archive.modules.fetcher.FetchFTP

fetchFTP(FetchFTP) {
    // digestAlgorithm = 'sha1'
    // digestContent = true
    // extractFromDirs = true
    // extractParent = true
    // maxFetchKBSec = 0
    // maxLengthBytes = 0
    // password = 'password'
    // soTimeoutMs = 0
    // timeoutSeconds = 0
    // username = 'anonymous'
}
digestAlgorithm
(java.lang.String) Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest hash of retrieved content-bodies.
digestContent
(boolean) Whether or not to perform an on-the-fly digest hash of retrieved content-bodies.
extractFromDirs
(boolean) Set to true to extract further URIs from FTP directories. Default is true.
extractParent
(boolean) Set to true to extract the parent URI from all FTP URIs. Default is true.
maxFetchKBSec
(int) The maximum KB/sec to use when fetching data from a server. The default of 0 means no maximum.
maxLengthBytes
(long) Maximum length in bytes to fetch. Fetch is truncated at this length. A value of 0 means no limit.
password
(java.lang.String) The password to send to FTP servers. By convention, anonymous users send their email address in this field.
soTimeoutMs
(int) If the socket is unresponsive for this number of milliseconds, give up. Set to zero for no timeout (not recommended: could hang a thread on an unresponsive server). This timeout is used for timing out socket opens and for timing out each socket read. Make sure this value is < #getTimeoutSeconds() for optimal configuration: ensures at least one retry read.
timeoutSeconds
(int) If the fetch is not completed in this number of seconds, give up (and retry later).
username
(java.lang.String) The username to send to FTP servers. By convention, the default value of “anonymous” is used for publicly available FTP sites.

FetchHTTP

HTTP fetcher that uses Apache HttpComponents.

<bean id="fetchHTTP" class="org.archive.modules.fetcher.FetchHTTP">
  <!-- <property name="acceptCompression" value="false" /> -->
  <!-- <property name="acceptHeaders" value="" /> -->
  <!-- <property name="cookieStore" value="" /> -->
  <!-- <property name="credentialStore" value="" /> -->
  <!-- <property name="defaultEncoding" value="ISO-8859-1" /> -->
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="httpBindAddress" value="" /> -->
  <!-- <property name="httpProxyHost" value="" /> -->
  <!-- <property name="httpProxyPassword" value="" /> -->
  <!-- <property name="httpProxyPort" value="" /> -->
  <!-- <property name="httpProxyUser" value="" /> -->
  <!-- <property name="ignoreCookies" value="false" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="sendConnectionClose" value="true" /> -->
  <!-- <property name="sendIfModifiedSince" value="true" /> -->
  <!-- <property name="sendIfNoneMatch" value="true" /> -->
  <!-- <property name="sendRange" value="false" /> -->
  <!-- <property name="sendReferer" value="true" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="shouldFetchBodyRule" value="" /> -->
  <!-- <property name="soTimeoutMs" value="" /> -->
  <!-- <property name="socksProxyHost" value="" /> -->
  <!-- <property name="socksProxyPort" value="" /> -->
  <!-- <property name="sslTrustLevel" value="" /> -->
  <!-- <property name="timeoutSeconds" value="" /> -->
  <!-- <property name="useHTTP11" value="false" /> -->
  <!-- <property name="userAgentProvider" value="" /> -->
</bean>
import org.archive.modules.fetcher.FetchHTTP

fetchHTTP(FetchHTTP) {
    // acceptCompression = false
    // acceptHeaders = ''
    // cookieStore = ''
    // credentialStore = ''
    // defaultEncoding = 'ISO-8859-1'
    // digestAlgorithm = 'sha1'
    // digestContent = true
    // httpBindAddress = ''
    // httpProxyHost = ''
    // httpProxyPassword = ''
    // httpProxyPort = 0
    // httpProxyUser = ''
    // ignoreCookies = false
    // maxFetchKBSec = 0
    // maxLengthBytes = 0
    // sendConnectionClose = true
    // sendIfModifiedSince = true
    // sendIfNoneMatch = true
    // sendRange = false
    // sendReferer = true
    // serverCache = ''
    // shouldFetchBodyRule = ''
    // soTimeoutMs = 0
    // socksProxyHost = ''
    // socksProxyPort = 0
    // sslTrustLevel = ''
    // timeoutSeconds = 0
    // useHTTP11 = false
    // userAgentProvider = ''
}
acceptCompression
(boolean) Set headers to accept compressed responses.
acceptHeaders
(java.util.List<java.lang.String>) Accept Headers to include in each request. Each must be the complete header, e.g., ‘Accept-Language: en’. (Thus, this can also be used to send other headers not beginning ‘Accept-’ as well.) By default heritrix sends an Accept header similar to what a typical browser would send (the value comes from Firefox 4.0).
cookieStore
(org.archive.modules.fetcher.AbstractCookieStore)
credentialStore
(org.archive.modules.credential.CredentialStore) Used to store credentials.
defaultEncoding
(java.lang.String) The character encoding to use for files that do not have one specified in the HTTP response headers. Default: ISO-8859-1.
digestAlgorithm
(java.lang.String) Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest hash of retrieved content-bodies.
digestContent
(boolean) Whether or not to perform an on-the-fly digest hash of retrieved content-bodies.
httpBindAddress
(java.lang.String) Local IP address or hostname to use when making connections (binding sockets). When not specified, uses default local address(es).
httpProxyHost
(java.lang.String) Proxy host IP (set only if needed).
httpProxyPassword
(java.lang.String) Proxy password (set only if needed).
httpProxyPort
(java.lang.Integer) Proxy port (set only if needed).
httpProxyUser
(java.lang.String) Proxy user (set only if needed).
ignoreCookies
(boolean) Disable cookie handling.
maxFetchKBSec
(int) The maximum KB/sec to use when fetching data from a server. The default of 0 means no maximum.
maxLengthBytes
(long) Maximum length in bytes to fetch. Fetch is truncated at this length. A value of 0 means no limit.
sendConnectionClose
(boolean) Send ‘Connection: close’ header with every request.
sendIfModifiedSince
(boolean) Send ‘If-Modified-Since’ header, if previous ‘Last-Modified’ fetch history information is available in URI history.
sendIfNoneMatch
(boolean) Send ‘If-None-Match’ header, if previous ‘Etag’ fetch history information is available in URI history.
sendRange
(boolean) Send ‘Range’ header when a limit (#setMaxLengthBytes(long)) on document size is set.

Be polite to the HTTP servers and send the ‘Range’ header, stating that you are only interested in the first n bytes. Only pertinent if #getMaxLengthBytes() > 0. Sending the ‘Range’ header results in a ‘206 Partial Content’ status response, which is better than just cutting the response mid-download. On rare occasion, sending ‘Range’ will generate a ‘416 Request Range Not Satisfiable’ response.

sendReferer
(boolean) Send ‘Referer’ header with every request.

The ‘Referer’ header contains the location the crawler came from, the page the current URI was discovered in. The ‘Referer’ usually is logged on the remote server and can be of assistance to webmasters trying to figure out how a crawler got to a particular area on a site.

serverCache
(org.archive.modules.net.ServerCache) Used to do DNS lookups.
shouldFetchBodyRule
(org.archive.modules.deciderules.DecideRule) DecideRules applied after receipt of HTTP response headers but before we start to download the body. If any filter returns FALSE, the fetch is aborted. Prerequisites such as robots.txt by-pass filtering (i.e. they cannot be midfetch aborted).
soTimeoutMs
(int) If the socket is unresponsive for this number of milliseconds, give up. Set to zero for no timeout (not recommended: could hang a thread on an unresponsive server). This timeout is used for timing out socket opens and for timing out each socket read. Make sure this value is < #getTimeoutSeconds() for optimal configuration: ensures at least one retry read.
socksProxyHost
(java.lang.String) Sets a SOCKS5 proxy host to use. This will override any set HTTP proxy.
socksProxyPort
(java.lang.Integer) Sets a SOCKS5 proxy port to use.
sslTrustLevel
(org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel) SSL certificate trust level. Range is from the default ‘open’ (trust all certs including expired, selfsigned, and those for which we do not have a CA) through ‘loose’ (trust all valid certificates including selfsigned), ‘normal’ (all valid certificates not including selfsigned) to ‘strict’ (Cert is valid and DN must match servername).
timeoutSeconds
(int) If the fetch is not completed in this number of seconds, give up (and retry later).
useHTTP11
(boolean) Use HTTP/1.1. Note: even when offering an HTTP/1.1 request, Heritrix may not properly handle persistent/keep-alive connections, so the sendConnectionClose parameter should remain ‘true’.
userAgentProvider
(org.archive.modules.fetcher.UserAgentProvider)

FetchHTTP2

HTTP Fetcher that uses Jetty HttpClient to support HTTP/2 and HTTP/3.

Does not record the original on-the-wire HTTP messages but instead a simplified HTTP/1.1 representation without transfer encoding.

Note: The WARC standard (as of version 1.1) does not specify how to record HTTP/2 or HTTP/3 messages. If you want to stay within the bounds of the base WARC standard without extensions, or want to ensure the exact bytes of the HTTP network message are recorded, you may prefer to use FetchHTTP.

<bean id="fetchHTTP2" class="org.archive.modules.fetcher.FetchHTTP2">
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
  <!-- <property name="httpProxyHost" value="" /> -->
  <!-- <property name="httpProxyPort" value="" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="socksProxyHost" value="" /> -->
  <!-- <property name="socksProxyPassword" value="" /> -->
  <!-- <property name="socksProxyPort" value="" /> -->
  <!-- <property name="socksProxyUsername" value="" /> -->
  <!-- <property name="timeoutSeconds" value="20" /> -->
  <!-- <property name="useHTTP2" value="true" /> -->
  <!-- <property name="useHTTP3" value="false" /> -->
  <!-- <property name="userAgentProvider" value="" /> -->
</bean>
import org.archive.modules.fetcher.FetchHTTP2

fetchHTTP2(FetchHTTP2) {
    // digestAlgorithm = 'sha1'
    // httpProxyHost = ''
    // httpProxyPort = 0
    // maxFetchKBSec = 0
    // maxLengthBytes = 0
    // socksProxyHost = ''
    // socksProxyPassword = ''
    // socksProxyPort = 0
    // socksProxyUsername = ''
    // timeoutSeconds = 20
    // useHTTP2 = true
    // useHTTP3 = false
    // userAgentProvider = ''
}
digestAlgorithm
(java.lang.String)
httpProxyHost
(java.lang.String) Proxy host IP (set only if needed).
httpProxyPort
(java.lang.Integer) Proxy port (set only if needed).
maxFetchKBSec
(int) The maximum KB/sec to use when fetching data from a server. The default of 0 means no maximum.
maxLengthBytes
(long) Maximum length in bytes to fetch. Fetch is truncated at this length. A value of 0 means no limit.
socksProxyHost
(java.lang.String) Sets a SOCKS5 proxy host to use. This will override any set HTTP proxy.
socksProxyPassword
(java.lang.String) Sets a SOCKS5 proxy password to use (enables username/password authentication).
socksProxyPort
(java.lang.Integer) Sets a SOCKS5 proxy port to use.
socksProxyUsername
(java.lang.String) Sets a SOCKS5 proxy username to use (enables username/password authentication).
timeoutSeconds
(int)
useHTTP2
(boolean) Configures whether the HTTP/2 protocol is enabled.
useHTTP3
(boolean) Configures whether HTTP/3 protocol should be enabled. Currently experimental and not enabled by default.
userAgentProvider
(org.archive.modules.fetcher.UserAgentProvider)

FetchSFTP

<bean id="fetchSFTP" class="org.archive.modules.fetcher.FetchSFTP">
  <!-- <property name="digestAlgorithm" value="sha1" /> -->
  <!-- <property name="digestContent" value="true" /> -->
  <!-- <property name="extractFromDirs" value="true" /> -->
  <!-- <property name="extractParent" value="true" /> -->
  <!-- <property name="maxFetchKBSec" value="0" /> -->
  <!-- <property name="maxLengthBytes" value="0" /> -->
  <!-- <property name="password" value="password" /> -->
  <!-- <property name="soTimeoutMs" value="" /> -->
  <!-- <property name="timeoutSeconds" value="" /> -->
  <!-- <property name="username" value="anonymous" /> -->
</bean>
import org.archive.modules.fetcher.FetchSFTP

fetchSFTP(FetchSFTP) {
    // digestAlgorithm = 'sha1'
    // digestContent = true
    // extractFromDirs = true
    // extractParent = true
    // maxFetchKBSec = 0
    // maxLengthBytes = 0
    // password = 'password'
    // soTimeoutMs = 0
    // timeoutSeconds = 0
    // username = 'anonymous'
}
digestAlgorithm
(java.lang.String) Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest hash of retrieved content-bodies.
digestContent
(boolean) Whether or not to perform an on-the-fly digest hash of retrieved content-bodies.
extractFromDirs
(boolean) Set to true to extract further URIs from SFTP directories. Default is true.
extractParent
(boolean) Set to true to extract the parent URI from all SFTP URIs. Default is true.
maxFetchKBSec
(int) The maximum KB/sec to use when fetching data from a server. The default of 0 means no maximum.
maxLengthBytes
(long) Maximum length in bytes to fetch. Fetch is truncated at this length. A value of 0 means no limit.
password
(java.lang.String) The password to send to SFTP servers. By convention, anonymous users send their email address in this field.
soTimeoutMs
(int) If the socket is unresponsive for this number of milliseconds, give up. Set to zero for no timeout (not recommended; could hang a thread on an unresponsive server). This timeout is used for timing out socket opens and for timing out each socket read. Make sure this value is less than the timeoutSeconds setting for optimal configuration: this ensures at least one retry read.
timeoutSeconds
(int) If the fetch is not completed in this number of seconds, give up (and retry later).
username
(java.lang.String) The username to send to SFTP servers. By convention, the default value of “anonymous” is used for publicly available SFTP sites.

FetchWhois

WHOIS Fetcher (RFC 3912). If this fetcher is enabled, Heritrix will attempt WHOIS lookups on the topmost assigned domain and the IP address of each URL.

WHOIS URIs

There is no pre-existing, canonical specification for WHOIS URIs. What follows is the format that Heritrix uses, which we propose for general use.

Syntax in ABNF as used in RFC 3986 Uniform Resource Identifier (URI): Generic Syntax:

whoisurl = “whois:” [ “//” host [ “:” port ] “/” ] whoisquery

whoisquery is a url-encoded string. In ABNF, whoisquery = 1*pchar where pchar is defined in RFC 3986. host and port are also as defined in RFC 3986.

To resolve a WHOIS URI which specifies host[:port], open a TCP connection to the host at the specified port (default 43), send the query (whoisquery, url-decoded) followed by CRLF, and read the response until the server closes the connection. For more details see RFC 3912.

Resolution of a “serverless” WHOIS URI, which does not specify host[:port], is implementation-dependent.

Serverless WHOIS URIs in Heritrix

For each non-WHOIS URI processed which has an authority, FetchWhois adds 1 or 2 serverless WHOIS URIs to the CrawlURI’s outlinks. These are “whois:{ipAddress}” and, if the authority includes a hostname, “whois:{topLevelDomain}”. See #addWhoisLinks(CrawlURI).

Heritrix resolves serverless WHOIS URIs by first querying an initial server,then following referrals to other servers. In pseudocode:

if query is an IPv4 address
    resolve whois://#DEFAULT_IP_WHOIS_SERVER/whoisquery
else
    let domainSuffix = part of query after the last '.' (or the whole query if no '.'), url-encoded
    resolve whois://#ULTRA_SUFFIX_WHOIS_SERVER/domainSuffix
while last response refers to another server, i.e. matches regex #WHOIS_SERVER_REGEX
    if we have a special query formatting rule for this whois server, apply it - see #specialQueryTemplates
    resolve whois://referralServer/whoisquery

See #deferOrFinishGeneric(CrawlURI, String)

<bean id="fetchWhois" class="org.archive.modules.fetcher.FetchWhois">
  <!-- <property name="bdbModule" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="soTimeoutMs" value="" /> -->
  <!-- <property name="specialQueryTemplates" value="" /> -->
</bean>
import org.archive.modules.fetcher.FetchWhois

fetchWhois(FetchWhois) {
    // bdbModule = ''
    // serverCache = ''
    // soTimeoutMs = 0
    // specialQueryTemplates = ''
}
bdbModule
(org.archive.bdb.BdbModule)
serverCache
(org.archive.modules.net.ServerCache)
soTimeoutMs
(int) If the socket is unresponsive for this number of milliseconds, give up. Set to zero for no timeout (not recommended; could hang a thread on an unresponsive server). This timeout is used for timing out socket opens and for timing out each socket read.
specialQueryTemplates
(java.util.Map<java.lang.String,java.lang.String>)

Browser Processor

BrowserProcessor

Opens a web page in a local web browser via WebDriver BiDi and runs Behaviors to interact with the page. Subresources loaded by the browser are recorded using an HTTP proxy. Must be used in conjunction with FetchHTTP2. Normally defined in the FetchChain after the link extractors.

<bean id="browserProcessor" class="org.archive.crawler.processor.BrowserProcessor">
  <!-- <property name="behaviors" value="" /> -->
  <!-- <property name="concurrency" value="20" /> -->
  <!-- <property name="executable" value="" /> -->
  <!-- <property name="options" value="" /> -->
</bean>
import org.archive.crawler.processor.BrowserProcessor

browserProcessor(BrowserProcessor) {
    // behaviors = ''
    // concurrency = 20
    // executable = ''
    // options = ''
}
behaviors
(java.util.List<org.archive.modules.behaviors.Behavior>) A list of Behaviors to run on each page.
concurrency
(int) Maximum number of web pages that can be open in the browser at once.
executable
(java.lang.String) Webdriver executable to launch. If null, will try several common paths.

Firefox can be used directly as it implements WebDriver BiDi natively. To use Chrome set this to a ChromeDriver executable.

options
(java.util.List<java.lang.String>) Extra command-line options to be passed to the webdriver executable.

ExtractLinksBehavior

Extracts navigation links from the loaded page using JavaScript.

<bean id="extractLinksBehavior" class="org.archive.modules.behaviors.ExtractLinksBehavior">
</bean>
import org.archive.modules.behaviors.ExtractLinksBehavior

extractLinksBehavior(ExtractLinksBehavior) {
}

ScrollDownBehavior

Scrolls the page down until it reaches the bottom (or until a timeout is reached).

<bean id="scrollDownBehavior" class="org.archive.modules.behaviors.ScrollDownBehavior">
  <!-- <property name="scrollInterval" value="50" /> -->
  <!-- <property name="timeout" value="5000" /> -->
</bean>
import org.archive.modules.behaviors.ScrollDownBehavior

scrollDownBehavior(ScrollDownBehavior) {
    // scrollInterval = 50
    // timeout = 5000
}
scrollInterval
(int) How many milliseconds to wait between each scroll step.
timeout
(long) Maximum time to wait to reach the bottom of the page, in milliseconds.

Post-Processors

CandidatesProcessor

Processor which sends all candidate outlinks through the CandidateChain, scheduling those with non-negative status codes to the frontier. Also performs special handling for ‘discovered seeds’ – URIs, as with redirects from seeds, that may deserve special treatment to expand the scope.

<bean id="candidatesProcessor" class="org.archive.crawler.postprocessor.CandidatesProcessor">
  <!-- <property name="candidateChain" value="" /> -->
  <!-- <property name="frontier" value="" /> -->
  <!-- <property name="loggerModule" value="" /> -->
  <!-- <property name="processErrorOutlinks" value="false" /> -->
  <!-- <property name="seeds" value="" /> -->
  <!-- <property name="seedsRedirectNewSeeds" value="true" /> -->
  <!-- <property name="seedsRedirectNewSeedsAllowTLDs" value="true" /> -->
  <!-- <property name="sheetOverlaysManager" value="" /> -->
</bean>
import org.archive.crawler.postprocessor.CandidatesProcessor

candidatesProcessor(CandidatesProcessor) {
    // candidateChain = ''
    // frontier = ''
    // loggerModule = ''
    // processErrorOutlinks = false
    // seeds = ''
    // seedsRedirectNewSeeds = true
    // seedsRedirectNewSeedsAllowTLDs = true
    // sheetOverlaysManager = ''
}
candidateChain
(org.archive.modules.CandidateChain) Candidate chain
frontier
(org.archive.crawler.framework.Frontier) The frontier to use.
loggerModule
(org.archive.crawler.reporting.CrawlerLoggerModule)
processErrorOutlinks
(boolean) If true, outlinks from status codes <200 and >=400 will be sent through candidates processing. Default is false.
seeds
(org.archive.modules.seeds.SeedModule)
seedsRedirectNewSeeds
(boolean) If enabled, any URL found because a seed redirected to it (original seed returned 301 or 302), will also be treated as a seed, as long as the hop count is less than {@value #SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS}.
seedsRedirectNewSeedsAllowTLDs
(boolean) If enabled, any URL found because a seed redirected to it (original seed returned 301 or 302), will also be treated as a seed, as long as the hop count is less than {@value #SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS}.
sheetOverlaysManager
(org.archive.crawler.spring.SheetOverlaysManager)

DispositionProcessor

A step, late in the processing of a CrawlURI, for marking-up the CrawlURI with values to affect frontier disposition, and updating information that may have been affected by the fetch. This includes robots info and other stats.

(Formerly called CrawlStateUpdater, when it did less.)

<bean id="dispositionProcessor" class="org.archive.crawler.postprocessor.DispositionProcessor">
  <!-- <property name="delayFactor" value="5.0" /> -->
  <!-- <property name="forceRetire" value="false" /> -->
  <!-- <property name="maxDelayMs" value="30000" /> -->
  <!-- <property name="maxPerHostBandwidthUsageKbSec" value="0" /> -->
  <!-- <property name="metadata" value="" /> -->
  <!-- <property name="minDelayMs" value="3000" /> -->
  <!-- <property name="respectCrawlDelayUpToSeconds" value="300" /> -->
  <!-- <property name="serverCache" value="" /> -->
</bean>
import org.archive.crawler.postprocessor.DispositionProcessor

dispositionProcessor(DispositionProcessor) {
    // delayFactor = 5.0
    // forceRetire = false
    // maxDelayMs = 30000
    // maxPerHostBandwidthUsageKbSec = 0
    // metadata = ''
    // minDelayMs = 3000
    // respectCrawlDelayUpToSeconds = 300
    // serverCache = ''
}
delayFactor
(float) How many multiples of last fetch elapsed time to wait before recontacting same server.
forceRetire
(boolean) Whether to set a CrawlURI’s force-retired directive, retiring its queue when it finishes. Mainly intended for URI-specific overlay settings; setting true globally will just retire all queues after they offer one URI, rapidly ending a crawl.
maxDelayMs
(int) never wait more than this long, regardless of multiple
maxPerHostBandwidthUsageKbSec
(int) maximum per-host bandwidth usage
metadata
(org.archive.modules.CrawlMetadata) Auto-discovered module providing configured (or overridden) User-Agent value and RobotsHonoringPolicy
minDelayMs
(int) always wait this long after one completion before recontacting same server, regardless of multiple
respectCrawlDelayUpToSeconds
(int) Whether to respect a ‘Crawl-Delay’ (in seconds) given in a site’s robots.txt
serverCache
(org.archive.modules.net.ServerCache)

ReschedulingProcessor

The most simple forced-rescheduling step possible: use a local setting (perhaps overlaid to vary based on the URI) to set an exact future reschedule time, as a delay from now. Unless the rescheduleDelaySeconds value is changed from its default, URIs are not rescheduled.

<bean id="reschedulingProcessor" class="org.archive.crawler.postprocessor.ReschedulingProcessor">
  <!-- <property name="rescheduleDelaySeconds" value="-1" /> -->
</bean>
import org.archive.crawler.postprocessor.ReschedulingProcessor

reschedulingProcessor(ReschedulingProcessor) {
    // rescheduleDelaySeconds = -1
}
rescheduleDelaySeconds
(long) amount of time to wait before forcing a URI to be rescheduled; default of -1 means “don’t reschedule”

WARCWriterChainProcessor

WARC writer processor. The types of records to be written can be configured by including or excluding WARCRecordBuilder implementations (see #setChain(List)).

This is the default chain:

  <property name="chain">   <list>    <bean class="org.archive.modules.warc.DnsResponseRecordBuilder"/>    <bean class="org.archive.modules.warc.HttpResponseRecordBuilder"/>    <bean class="org.archive.modules.warc.WhoisResponseRecordBuilder"/>    <bean class="org.archive.modules.warc.FtpControlConversationRecordBuilder"/>    <bean class="org.archive.modules.warc.FtpResponseRecordBuilder"/>    <bean class="org.archive.modules.warc.RevisitRecordBuilder"/>    <bean class="org.archive.modules.warc.HttpRequestRecordBuilder"/>    <bean class="org.archive.modules.warc.MetadataRecordBuilder"/>   </list>  </property>

Replaces WARCWriterProcessor.

<bean id="wARCWriterChainProcessor" class="org.archive.modules.writer.WARCWriterChainProcessor">
  <!-- <property name="chain" value="" /> -->
  <!-- <property name="compress" value="true" /> -->
  <!-- <property name="directory" value="" /> -->
  <!-- <property name="frequentFlushes" value="true" /> -->
  <!-- <property name="maxFileSizeBytes" value="" /> -->
  <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
  <!-- <property name="maxWaitForIdleMs" value="" /> -->
  <!-- <property name="metadataProvider" value="" /> -->
  <!-- <property name="poolMaxActive" value="" /> -->
  <!-- <property name="prefix" value="" /> -->
  <!-- <property name="recordIDGenerator" value="" /> -->
  <!-- <property name="serverCache" value="" /> -->
  <!-- <property name="skipIdenticalDigests" value="false" /> -->
  <!-- <property name="startNewFilesOnCheckpoint" value="true" /> -->
  <!-- <property name="storePaths" value="" /> -->
  <!-- <property name="template" value="" /> -->
  <!-- <property name="writeBufferSize" value="" /> -->
</bean>
import org.archive.modules.writer.WARCWriterChainProcessor

wARCWriterChainProcessor(WARCWriterChainProcessor) {
    // chain = ''
    // compress = true
    // directory = ''
    // frequentFlushes = true
    // maxFileSizeBytes = 0
    // maxTotalBytesToWrite = 0
    // maxWaitForIdleMs = 0
    // metadataProvider = ''
    // poolMaxActive = 0
    // prefix = ''
    // recordIDGenerator = ''
    // serverCache = ''
    // skipIdenticalDigests = false
    // startNewFilesOnCheckpoint = true
    // storePaths = ''
    // template = ''
    // writeBufferSize = 0
}
chain
(java.util.List<? extends org.archive.modules.warc.WARCRecordBuilder>)
compress
(boolean) Whether to gzip-compress files when writing to disk; by default true, meaning do-compress.
directory
(org.archive.spring.ConfigPath)
frequentFlushes
(boolean) Whether to flush to underlying file frequently (at least after each record), or not. Default is true.
maxFileSizeBytes
(long) Max size of each file.
maxTotalBytesToWrite
(long) Total file bytes to write to disk. Once the size of all files on disk has exceeded this limit, this processor will stop the crawler. A value of zero means no upper limit.
maxWaitForIdleMs
(int) Maximum time to wait on idle writer before (possibly) creating an additional instance.
metadataProvider
(org.archive.modules.CrawlMetadata)
poolMaxActive
(int) Maximum active files in pool. This setting cannot be varied over the life of a crawl.
prefix
(java.lang.String) File prefix. The text supplied here will be supplied to the naming template (below) as the ‘prefix’ variable for possible interpolation. In the default/recommended naming formula, the prefix will appear first.
recordIDGenerator
(org.archive.uid.RecordIDGenerator)
serverCache
(org.archive.modules.net.ServerCache)
skipIdenticalDigests
(boolean) Whether to skip the writing of a record when URI history information is available and indicates the prior fetch had an identical content digest. Note that subclass settings may provide more fine-grained control on how identical digest content is handled; for those controls to have effect, this setting must not be ‘true’ (causing content to be skipped entirely). Default is false.
startNewFilesOnCheckpoint
(boolean) Whether to close output files and start new ones on checkpoint. True by default. If false, merely flushes writers.
storePaths
(java.util.List<org.archive.spring.ConfigPath>) Where to save files. Supply absolute or relative directory paths. If relative, paths will be interpreted relative to the local ‘directory’ property. order.disk-path setting. If more than one path specified, we’ll round-robin dropping files to each. This setting is safe to change midcrawl (You can remove and add new dirs as the crawler progresses).
template
(java.lang.String) Template from which a filename is interpolated. Expressions of the form ${key} will be replaced by values from a local map of useful values (including ‘prefix’, ‘timestamp17’, and ‘serialno’) or global system properties (which includes the local hostname/port/pid).

The default template is:

“${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port}”

The default template will generate unique names under reasonable assumptions; be sure you know what you’re doing before customizing, as you could easily create filename collisions with a poorly-designed filename template, and many downstream tools have historically assumed that ARCs/WARCs are carefully named to preserve uniqueness.

writeBufferSize
(int) Size of buffer in front of disk-writing. Default is 256K.

DnsResponseRecordBuilder

<bean id="dnsResponseRecordBuilder" class="org.archive.modules.warc.DnsResponseRecordBuilder">
</bean>
import org.archive.modules.warc.DnsResponseRecordBuilder

dnsResponseRecordBuilder(DnsResponseRecordBuilder) {
}

FtpControlConversationRecordBuilder

<bean id="ftpControlConversationRecordBuilder" class="org.archive.modules.warc.FtpControlConversationRecordBuilder">
</bean>
import org.archive.modules.warc.FtpControlConversationRecordBuilder

ftpControlConversationRecordBuilder(FtpControlConversationRecordBuilder) {
}

FtpResponseRecordBuilder

<bean id="ftpResponseRecordBuilder" class="org.archive.modules.warc.FtpResponseRecordBuilder">
</bean>
import org.archive.modules.warc.FtpResponseRecordBuilder

ftpResponseRecordBuilder(FtpResponseRecordBuilder) {
}

HttpRequestRecordBuilder

<bean id="httpRequestRecordBuilder" class="org.archive.modules.warc.HttpRequestRecordBuilder">
</bean>
import org.archive.modules.warc.HttpRequestRecordBuilder

httpRequestRecordBuilder(HttpRequestRecordBuilder) {
}

HttpResponseRecordBuilder

<bean id="httpResponseRecordBuilder" class="org.archive.modules.warc.HttpResponseRecordBuilder">
</bean>
import org.archive.modules.warc.HttpResponseRecordBuilder

httpResponseRecordBuilder(HttpResponseRecordBuilder) {
}

MetadataRecordBuilder

<bean id="metadataRecordBuilder" class="org.archive.modules.warc.MetadataRecordBuilder">
</bean>
import org.archive.modules.warc.MetadataRecordBuilder

metadataRecordBuilder(MetadataRecordBuilder) {
}

RevisitRecordBuilder

<bean id="revisitRecordBuilder" class="org.archive.modules.warc.RevisitRecordBuilder">
</bean>
import org.archive.modules.warc.RevisitRecordBuilder

revisitRecordBuilder(RevisitRecordBuilder) {
}

WhoisResponseRecordBuilder

<bean id="whoisResponseRecordBuilder" class="org.archive.modules.warc.WhoisResponseRecordBuilder">
</bean>
import org.archive.modules.warc.WhoisResponseRecordBuilder

whoisResponseRecordBuilder(WhoisResponseRecordBuilder) {
}