public class MediaWikiAPIPageExtractor extends AbstractMediaWikiAPIExtractor
| Modifier and Type | Field and Description |
|---|---|
private java.lang.String |
baseURL |
private java.lang.String[] |
contentTypes |
private boolean |
crawlClasses |
private WandoraToolLogger |
logger |
private int |
nExtracted |
private int |
progress |
private java.lang.String[] |
qType |
private java.lang.String |
queryURL |
CONTENT_TYPE_SI, LANG_SI, PAGE_SI, SI_ROOT
CUSTOM_EXTRACTOR, DONE_FAILED, DONE_MANY, DONE_ONE, EXACTLY_GIVEN_URLS, FILE_EXTRACTOR, FILE_PATTERN, GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS, GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN, GIVEN_URLS_AND_LINKED_DOCUMENTS, GIVEN_URLS_AND_URL_BELOW, INFO_WAIT_WHILE_WORKING, LOG_TITLE, POINT_START_URL_TEXT, RAW_EXTRACTOR, SELECT_DIALOG_TITLE, STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE, URL_EXTRACTOR
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
| Constructor and Description |
|---|
MediaWikiAPIPageExtractor(java.lang.String baseURL,
java.lang.String[] qType,
boolean crawl) |
| Modifier and Type | Method and Description |
|---|---|
boolean |
_extractTopicsFrom(java.io.File f,
TopicMap t) |
boolean |
_extractTopicsFrom(java.lang.String str,
TopicMap t) |
boolean |
_extractTopicsFrom(java.net.URL u,
TopicMap t) |
private void |
continueExtraction(org.wandora.dep.json.JSONObject contObject,
TopicMap t) |
private boolean |
extractTopicsFromString(java.lang.String str,
TopicMap t) |
private boolean |
extractTopicsFromURL(java.net.URL u,
TopicMap t) |
private java.lang.String |
getArticleBody(java.lang.String title) |
private java.util.List<java.lang.String> |
getArticleClasses(java.lang.String title) |
private java.util.HashMap<java.lang.String,java.lang.String> |
getArticleInfo(java.lang.String title) |
protected java.lang.String |
getBaseUrl() |
java.lang.String[] |
getContentTypes()
Returns an array of String containing the content-types this
ContentHandler can process. |
protected java.lang.String |
getQueryUrl() |
protected void |
incrementExtractions() |
private org.wandora.dep.json.JSONObject |
parse(org.wandora.dep.json.JSONObject body,
TopicMap tm) |
private void |
parsePage(org.wandora.dep.json.JSONObject page,
TopicMap tm) |
private void |
parsePage(java.lang.String title,
TopicMap tm) |
private void |
printError(org.wandora.dep.json.JSONObject body) |
private void |
printWarnings(org.wandora.dep.json.JSONObject body) |
boolean |
runInOwnThread()
Whether or not this tool should fork own thread.
|
protected void |
setQueryUrl(java.lang.String u) |
boolean |
useURLCrawler() |
getContentTypeTopic, getLangTopic, getLangTopic, getMediaWikiClass, getOrCreateTopic, getOrCreateTopic, getWandoraClassTopic, makeSubclassOf
acceptBrowserExtractRequest, addCrawlerUrl, browserExtractorConsumesPlainText, buildSI, buildSL, clearMasterSubject, createAssociation, createAssociation, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, croppedFilename, croppedFilename, croppedUrlString, croppedUrlString, doBrowserExtract, dropExtract, dropExtract, dropExtract, execute, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFromText, getBrowserExtractorName, getCrawlerMode, getDescription, getExtractorType, getForceContent, getForceFiles, getForceUrls, getGUIText, getGUIText, getIcon, getInterruptsHandled, getMasterSubject, getName, getType, getWandora, handle, handleContent, handleCustomType, handleFiles, handleForcedContent, handleInterrupt, handleStringContent, handleUrls, initializeCustomType, instantDropHandle, makeSubclassOfWandoraClass, setData, setDisplayName, setForceContent, setForceFiles, setForceUrls, setMasterSubject, setMasterSubject, setTopicMap, setupCrawler, setWandora, takeNap, urlEncode, useTempTopicMap
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, configure, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isConfigurable, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
configure, execute, execute, getContext, getToolMenuItem, hlog, initialize, isConfigurable, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
private int nExtracted
private java.lang.String baseURL
private java.lang.String queryURL
private boolean crawlClasses
private java.lang.String[] qType
private int progress
private WandoraToolLogger logger
private final java.lang.String[] contentTypes
MediaWikiAPIPageExtractor(java.lang.String baseURL,
java.lang.String[] qType,
boolean crawl)
public boolean useURLCrawler()
useURLCrawler in class AbstractExtractor
public boolean runInOwnThread()
Description copied from class: AbstractWandoraTool
Whether or not this tool should fork own thread.
runInOwnThread in class AbstractExtractor
protected void setQueryUrl(java.lang.String u)
protected java.lang.String getBaseUrl()
protected java.lang.String getQueryUrl()
protected void incrementExtractions()
public java.lang.String[] getContentTypes()
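Description copied from interface: Handler
Returns an array of String containing the content-types this ContentHandler can process.
getContentTypes in interface Handler
getContentTypes in class AbstractExtractor
public boolean _extractTopicsFrom(java.io.File f,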
TopicMap t)
throws java.lang.Exception
_extractTopicsFrom in class AbstractExtractor
Throws:
java.lang.Exception
public boolean _extractTopicsFrom(java.net.URL u,
TopicMap t)
throws java.lang.Exception
_extractTopicsFrom in class AbstractExtractor
Throws:
java.lang.Exception
public boolean _extractTopicsFrom(java.lang.String str,
TopicMap t)
throws java.lang.Exception
_extractTopicsFrom in class AbstractExtractor
Throws:
java.lang.Exception
private boolean extractTopicsFromString(java.lang.String str,
TopicMap t)
private boolean extractTopicsFromURL(java.net.URL u,
TopicMap t)
private void continueExtraction(org.wandora.dep.json.JSONObject contObject,
TopicMap t)
throws java.lang.Exception
Throws:
java.lang.Exception
private org.wandora.dep.json.JSONObject parse(org.wandora.dep.json.JSONObject body,
TopicMap tm)
throws org.wandora.dep.json.JSONException,
TopicMapException,
java.io.IOException
Throws:
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private void parsePage(org.wandora.dep.json.JSONObject page,
TopicMap tm)
throws org.wandora.dep.json.JSONException,
TopicMapException,
java.io.IOException
Throws:
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private void parsePage(java.lang.String title,
TopicMap tm)
throws org.wandora.dep.json.JSONException,
TopicMapException,
java.io.IOException
Throws:
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private java.lang.String getArticleBody(java.lang.String title)
throws java.io.IOException
Throws:
java.io.IOException
private java.util.HashMap<java.lang.String,java.lang.String> getArticleInfo(java.lang.String title)
throws java.io.IOException
Throws:
java.io.IOException
private java.util.List<java.lang.String> getArticleClasses(java.lang.String title)
throws java.io.IOException
Throws:
java.io.IOException
private void printError(org.wandora.dep.json.JSONObject body)
throws org.wandora.dep.json.JSONException
Throws:
org.wandora.dep.json.JSONException
private void printWarnings(org.wandora.dep.json.JSONObject body)
throws org.wandora.dep.json.JSONException
Throws:
org.wandora.dep.json.JSONException
Copyright 2004-2015 Wandora Team