用户:Koalabot/Taxobar bot[编辑]
外观
本源代码改编自en:User:Tom.Bot/Task3 code。
源代码
/// <summary>
/// Page-processing callback that tries to insert a <c>{{Taxonbar|from=QID}}</c> template into a
/// taxon article, just before its first DEFAULTSORT/category line.
/// (The signature and the <c>Tools</c> helper suggest an AutoWikiBrowser custom module; this file
/// does not confirm that.)
/// </summary>
/// <remarks>
/// Processing order:
/// 1. skip pages that are too large (optional), carry a person/scientist infobox, lack a
///    taxobox-style infobox, or already have a Taxonbar;
/// 2. look up the page's Wikidata item (QID) through the zh.wikipedia.org API;
/// 3. fetch the item's claims from Wikidata and count populated "good" and "bad" taxon-ID properties;
/// 4. if at least one good property exists, move a trailing stub tag after the first category and
///    insert the Taxonbar before the DEFAULTSORT/category block.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Page title; used for the API lookup, sandbox detection and the skip log.</param>
/// <param name="wikiNamespace">Not used by this method.</param>
/// <param name="Summary">Receives the edit summary on success, otherwise the skip reason(s).</param>
/// <param name="Skip">Receives true when the page should not be saved; forced to false in debug modes.</param>
/// <returns>The (possibly modified) wikitext.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    // global switches //////////////////////////////////////////////////////////
    bool SaveSkipSummaries = false;
    bool SkipPagesLargerThanLimit = false; // used with int Limit
    bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
    bool ManuallyPlaceTaxonbarAtEndOfPage = false; // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual only
    bool LiveDebug = false;
    bool SandboxDebug = false; // auto-detects

    // global-use vars //////////////////////////////////////////////////////////
    int Limit = 2500; // characters on a page; used with bool SkipPagesLargerThanLimit
    Skip = false;
    Summary = "";

    // preliminary exceptions/error checking ////////////////////////////////////
    if (ArticleTitle == "User:Tom.Reding/sandbox")
    {
        SandboxDebug = true;
    }

    // A plain length comparison is equivalent to matching ^[\d\D]{Limit+1} and avoids a regex pass.
    if (SkipPagesLargerThanLimit && ArticleText.Length > Limit)
    {
        Summary += "Too big (>" + Limit + "B). ";
        Skip = true;
    }

    // check for inappropriate infoboxes
    string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
    string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist|學術研究工作者|学术研究工作者)(?=\s*(?:\||\<\!\-\-))";
    bool BadInfobox1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
    bool BadInfobox2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
    if (BadInfobox1 || BadInfobox2)
    {
        Summary += @"Person/scientist infobox found. ";
        Skip = true;
    }

    // check for appropriate infoboxes
    string TitleTemplates_Regex = @"\{\{\s*(?:DISPLAY ?TITLE|[Ii]talicisedtitle|[Ii]talicised[ _]+title|[Ii]talicizedtitle|[Ii]talicized[ _]+title|[Ii]talicizetitle|[Ii]talicize[ _]+title|[Ii]talicstitle|[Ii]talics[ _]+title|[Ii]talics|ITALICTITLE|[Ii]talictitle|[Ii]talic[ _]+title[ _]+infobox|[Ii]talic[ _]+title|[Ii]talic|[Ii]tal|[Rr]edirect[ _]+italic[ _]+title|[Tt]itle[ _]+italic)";
    string TaxoTemplates_Regex = @"\{\{\s*(?:Template:\s*|Wikipedia:\s*)?(?:Infobox[ _]+)?(" + // prefixes
        @"Taxobox|Taxo|TX|Species ?box|Subspeciesbox|Infraspeciesbox|Virusspeciesbox|Subspeciesbox/ICN|" + // taxo/species
        @"Automatic[ _]+t?axobox|" + // auto
        @"bacteria|microorganism|virus" + // other
        @")(?=\s*(?:\||\<\!\-\-|" + TitleTemplates_Regex + @"|(?<=Automatic[ _]+t?axobox\s*)\}\}))"; // suffixes
    bool NoTaxoTemplates = !Regex.IsMatch(ArticleText, TaxoTemplates_Regex, RegexOptions.IgnoreCase);

    // In manual-check mode a page without a taxobox is allowed through (unless a bad infobox
    // already set Skip above); otherwise the missing taxobox is itself a skip reason.
    if (NoTaxoTemplates && !ManuallyCheckPagesWithoutAGoodInfobox)
    {
        Summary += @"No auto/taxo/speciesbox found. ";
        Skip = true;
    }

    // check for {{Taxonbar
    string TaxonbarAliases_Regex = @"\{\{\s*(?:[Tt]axobar|[Tt]axon\-bar|[Tt]axonbar|[Tt]axonBar|[Tt]axonIds|[Tt]axon[ _]+bar)"; // 0 grps
    bool HasTaxonbar = Regex.IsMatch(ArticleText, TaxonbarAliases_Regex, RegexOptions.IgnoreCase);
    if (HasTaxonbar)
    {
        Summary += @"Taxonbar exists. ";
        Skip = true;
    }

    // get wikibase_item via WP API
    // ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
    // Uri.EscapeDataString percent-encodes the title as UTF-8, so characters such as the en dash,
    // '%', '#', '=' and non-ASCII text are encoded correctly for use as a query-string value.
    string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
    string URL1 = @"https://zh.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" +
        ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
    string HTML1 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML1 = Tools.GetHTML(URL1);
        }
        catch
        {
            // Best-effort fetch: any failure is reported through Summary and (unless live-debugging) skips the page.
            Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
            if (!LiveDebug)
            {
                Skip = true;
            }
        }
    }

    // html1 error checks ///////////////////////////////////////////////////////
    string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
    if (string.IsNullOrEmpty(QID) && !Skip && !SandboxDebug)
    {
        Summary = @"QID retrieval failed. ";
        Skip = true;
    }
    // Case sensitive, just to be safe. ASCII digits and \z (not $) so that nothing but "Q<digits>"
    // can reach the Wikidata URL and the inserted wikitext.
    if (!Regex.IsMatch(QID, @"^Q[0-9]+\z") && !Skip && !SandboxDebug)
    {
        Summary = @"Unexpected QID format. ";
        Skip = true;
    }

    // determine quantity & quality of WD properties used ///////////////////////
    // Alphabetically from [[Template:Taxonbar#Taxon identifiers]].
    // P830 (EOL), P846 (GBIF) and P1070 (TPL) are deliberately absent: ignored properties don't count
    // towards the total property count, per [[WT:TREE#Taxonbar addition requirements]].
    List<string> GoodPropertyList = new List<string>(new string[] {
        "P4024", "P2036", "P1348", "P3594", "P2833", "P2026", "P2946", "P3398",
        "P838", "P687", "P2464", "P3060", "P1940", "P3444",
        "P1895", "P938", "P3101", "P1727", "P3100", "P1747", "P842",
        "P1832", "P1421", "P3099", "P1076", "P3151", "P1391", "P961", "P586",
        "P815", "P627", "P3064", "P1991", "P959", "P962", "P685", "P4122",
        "P2434", "P3102",
        "P1772", "P1992", "P2040", "P2455", "P960", "P1745", "P1761", "P3591",
        "P850", "P3288", "P2426", "P1746"
    });
    List<string> BadPropertyList = new List<string>(new string[] {
        "P830", // ignore: EOL, Encyclopedia of Life
        "P846", // ignore: GBIF, Global Biodiversity Information Facility
        "P1070", // ignore: TPL, The Plant List
        // remaining 13 uniques from [[d:Wikidata:WikiProject Taxonomy#Databases]]:
        // [[Module:Taxonbar/conf]] needs updating (follow up after bulk run)
        "P1939", "P2752", "P2794", "P3088", "P3186", "P3322", "P3420",
        "P3606", "P4125", "P4194", "P4301", "P4311", "P4526"
    });

    // get Wikidata
    // ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q36557
    string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
    string HTML2 = "";
    if (!Skip && !SandboxDebug)
    {
        try
        {
            HTML2 = Tools.GetHTML(URL2);
        }
        catch
        {
            // Best-effort fetch, same policy as the first request.
            Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
            if (!LiveDebug)
            {
                Skip = true;
            }
        }
    }

    // scrape Wikidata
    // example text surrounding a populated property:
    //    "P959": [
    //        {
    //            "mainsnak": {
    //                "snaktype": "value",
    //                "property": "P959",
    //                "hash": "c18d910a13321717e90ba037d26f1f1b86558128",
    //                "datavalue": {
    //                    "value": "11500009",
    //                    "type": "string"
    //                },
    //                "datatype": "external-id"
    //            },
    int iGoodProps = 0;
    int iBadProps = 0;
    if (!Skip && !SandboxDebug)
    {
        // A property counts only when its mainsnak carries a non-empty string datavalue.
        string PropertyPrefix_Regex = @"""property"":\s*""";
        string PropertySuffix_Regex = @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";

        foreach (string p in GoodPropertyList)
        {
            if (Regex.IsMatch(HTML2, PropertyPrefix_Regex + p + PropertySuffix_Regex))
            {
                iGoodProps++;
            }
        }
        foreach (string p in BadPropertyList)
        {
            if (Regex.IsMatch(HTML2, PropertyPrefix_Regex + p + PropertySuffix_Regex))
            {
                iBadProps++;
            }
        }

        if (iGoodProps == 0)
        {
            if (iBadProps > 0)
            {
                Summary += "No good PIDs found. ";
            }
            else
            {
                Summary += "No PIDs found. ";
            }
            Skip = true;
        }
    }

    // main /////////////////////////////////////////////////////////////////////
    if (!Skip)
    {
        if (SandboxDebug)
        {
            // No network lookups happen in sandbox mode, so use placeholder values.
            iGoodProps = 1;
            QID = "1";
        }

        // move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Taxonbar}} that can't be fixed w/o a reparse ([[Smythea]])
        // leading "\s*" & "\n" for cases like "{{reflist}}{{Malvales-stub}}" ([[Herrania mariae]])
        string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*(?:[ -]stub|小作品)\s*\}\})\s*(\[\[\s*(?:Category|Cat|分类|分類)[^\[\]]+\]\])";
        ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);

        string Plural = (iGoodProps > 1) ? "s" : "";
        string TaxonbarComplete = @"{{Taxonbar|from=" + QID + @"}}";
        // Captures everything before the first DEFAULTSORT/category line; the leading ^ (no Multiline)
        // means the pattern can match at most once.
        string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ ]*(?:\{\{\s*Default ?sort|\[\[\s*(?:Category|Cat|分类|分類)))"; // better results than adding after last cat ([[Hellolycaena]])
        string SuccessSummary = @"+{{[[Template:Taxonbar|Taxonbar]]|" +
            @"from=" +
            @"[[d:Special:EntityPage/" + QID + @"|" + QID + @"]]}} " +
            @"(" + iGoodProps + @" sig. taxon ID" + Plural + @"); " +
            @"WP:GenFixes on,";

        bool NoCat = !Regex.IsMatch(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
        if (!NoCat)
        {
            ArticleText = Regex.Replace(ArticleText, AddBeforeCats_Regex, @"$1" + "\n" + TaxonbarComplete, RegexOptions.IgnoreCase);
            Summary = SuccessSummary;
        }
        else if (ManuallyPlaceTaxonbarAtEndOfPage)
        {
            ArticleText += "\n" + TaxonbarComplete;
            Summary = SuccessSummary + " (uncategorized page) ";
        }
        else
        {
            Summary += @"No cats/defaultsort to anchor {{Taxonbar}} around. Batch manually/code later. ";
            Skip = true;
        }
    }

    // exception tracking ///////////////////////////////////////////////////////
    if (Skip && SaveSkipSummaries && !SandboxDebug)
    {
        string Message = ArticleTitle + "\t" + Summary + "\n";
        string LogFileName = @"Module output - Add {{Taxonbar+from}} (skip summaries).txt";
        string LogDirectory = @"F:\"; // desktop
        string FullPath = LogDirectory + LogFileName;
        const bool APPEND = true;
        Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
    }

    // Debug modes never skip, so the result can always be inspected.
    if (LiveDebug || SandboxDebug)
    {
        Skip = false;
    }
    return ArticleText;
}