Utente:GiacoBot/itwiki
Aspetto
# -*- coding: utf-8 -*- # # (C) Daniel Herding, 2004 # # Distributed under the terms of the MIT license. # __version__='$Id: replace.py,v 1.102 2006/03/12 16:35:54 wikipedian Exp $' from __future__ import generators import sys, re import wikipedia, pagegenerators, catlib, config # Summary messages in different languages # NOTE: Predefined replacement tasks might use their own dictionary, see 'fixes' # below. msg = { 'de':u'Bot: Automatisierte Textersetzung %s', 'en':u'Robot: Automated text replacement %s', 'es':u'Robot: Reemplazo automático de texto %s', 'fr':u'Bot : Remplacement de texte automatisé %s', 'hu':u'Robot: Automatikus szövegcsere %s', 'ia':u'Robot: Reimplaciamento automatic de texto %s', 'is':u'Vélmenni: breyti texta %s', 'it':u'Bot: Sostituzione automatica del testo %s', 'ka':u'რáƒáƒ‘áƒáƒ¢áƒ˜: ტექსტის áƒáƒ•áƒ¢áƒáƒ›áƒáƒ¢áƒ£áƒ ი შეცვლრ%s', 'lt':u'Botas: Automatinis teksto keitimas %s', 'pt':u'Bot: Mudança automática %s', 'sr':u'Бот: ÐутоматÑка замена текÑта %s', } # Predefined replacements tasks. 
# Each spelling fix is written down once, in its correct form; the rule that
# repairs the wrong final accent is derived from the word by
# _finalAccentRule().
_OPPOSITE_ACCENT = {u'é': u'è', u'è': u'é'}


def _finalAccentRule(correct):
    """
    Build the (pattern, replacement) pair that repairs the final accent of
    one word.

    Arguments:
        * correct - The correctly spelled word (or phrase), ending in é or è.

    Returns a 2-tuple of regular expression source and replacement template.
    The pattern matches the whole word written with the opposite accent on
    its last letter; the first letter is captured in either case so that the
    replacement keeps the capitalisation found in the text.
    """
    stem, good = correct[:-1], correct[-1]
    wrong = _OPPOSITE_ACCENT[good]
    first, rest = stem[0], stem[1:]
    pattern = u'\\b([%s%s])%s%s\\b' % (first.lower(), first.upper(), rest, wrong)
    return (pattern, u'\\1%s%s' % (rest, good))


# Conjunctions (and other words) ending in -ché.
_CONJUNCTIONS = u"""
    abbenché acciocché affinché alcunché allorché altroché ancorché anziché
    attesoché avvegnaché avvegnadioché avvengaché avvengadioché benché
    checché ciocché comecché conciofosseché contuttoché cosicché cotalché
    dacché dappoiché dimodoché dopoché dopodiché essendoché finattantoché
    finché fintantoché finacché fuorché giacché granché giafosseché
    giafossecosaché infinattantoché lorché inquantoché macché nonché
    nonsoché oltreché ondeché perché perciocché perloché perocché poiché
    posciaché pressoché purché quantoché quasiché secondoché sennonché
    senonché sicché sinattantoché sinché sintantoché stanteché talché
    talmenteché tantoché tranneché tuttoché
""".split()

# Remote past tense forms.
_REMOTE_PAST = u"""
    abbatté accedé addissé adempié annetté antiprendé appartené appendé
    apprendé arrendé arrompé assisté assolvé astené attené avvedé batté
    cedé cerné chiedé combatté competé compié comprendé concedé concerné
    connetté consisté contené controbatté convedé convergé convivé corrompé
    credé decedé defletté delinqué desisté detené devolvé dibatté dipendé
    diprendé dirimé discerné disottené disperdé displendé dissolvé
    dissovvenné distemé disvolvé eccedé elidé eludé erompé esigé esimé
    esisté espandé estroqué evolvé fendé fervé fletté frapprendé fremé
    genufletté imbatté imbevé impié imprendé incedé incombé infremé insisté
    intercedé interprendé interrompé intessé intraprendé intrarompé
    intratessé intrattené intravedé introfletté irrompé mantené mieté
    ottené pendé perdé persisté piové possedé poté precedé preesisté premé
    prendé prescindé presiedé prevedé procedé propendé prorompé provolvé
    rapprendé rattené ravvedé recedé redigé rendé resisté retrocedé
    riannetté ribatté ricedé ricevé richiedé riconnetté riconverré ricredé
    riedé riempié rifletté ringodé ripenté ripeté riprendé risedé risiedé
    risolvé risplendé ritené rivedé rivendé rivivé rompé sbatté scandé
    scerné sconnetté secerné sfotté soccombé soprassedé sopravvivé
    sorprendé sostené spandé sperdé splendé spremé sterné strafotté
    stravedé stridé temé tené tessé tralucé transigé trattené travedé vedé
    vendé verté
""".split()

# Words of French origin that keep the acute accent.
_FRENCH_TERMS = u"""
    antiré autodafé canneté capitonné cliché cloisonné consommé cimpanzé
    coupé craquelé decolleté décolleté defilé défilé degagé dégagé delavé
    délavé demodé démodé deraciné déraciné deshabillé déshabillé ecarté
    engagé flambé foncé frisé gaufré glacé granmercé habitué imprimé
    interré karité matelassé merzé meublé moiré mouliné negligé négligé
    nontiscordardimé pancarré pâté saint-honoré separé soufflé tamuré
    trentatré truffé varieté variété viceré ventitré
""".split() + [
    # Phrases containing a space cannot go through split().
    u'enfant gâté',
    u'hôtel meublé',
    u'marron glacé',
]

# Words (Italian ones such as caffè included) that take the grave accent.
_GRAVE_WORDS = u"""
    ahimè ammazzacaffè appiè bignè buffè cabarè cabriolè canapè carcadè
    chedivè cioè coccodè controbuffè corvè croscè cupè dappiè dosacaffè
    euhoè forfè karkadè kedivè lacchè macinacaffè macramè mordorè musmè
    narghilè parchè purè radiorelè ramiè sufflè toppè tostacaffè tuppè
    vahinè
""".split()

fixes = {
    # for it.wikipedia
    'accenti': {
        'regex': True,
        'msg': {
            'it': u'Correzione di alcuni errori comuni contenuti in questa [[Discussioni_Wikipedia:Bot/Sostituzioni/Espressioni_regolari|lista]]',
        },
        # Every pattern is anchored on word boundaries and no two patterns
        # match the same word, so the order of the rules does not matter.
        'replacements': [
            _finalAccentRule(word)
            for word in _CONJUNCTIONS + _REMOTE_PAST + _FRENCH_TERMS + _GRAVE_WORDS
        ] + [
            # Rules that do not follow the "first letter in either case"
            # scheme.
            (u'\\bmercè\\b', u'mercé'),
            (u'\\b([sS])cimpanz[eè]\\b', u'\\1cimpanzé'),
            (u'\\bGiosué\\b', u'Giosuè'),
            (u'\\bMosé\\b', u'Mosè'),
            # Other replacements: section headings.
            (u"(?m)(== ?[Cc]ollegamenti Esterni ?==)", u"== Collegamenti esterni =="),
            (u"(?m)(== ?[Ll]ink [Ee]sterni ?==)", u"== Collegamenti esterni =="),
            (u"(?m)(== ?[Vv]edi [Aa]nche ?==)", u"== Voci correlate =="),
        ],
    },
}


def _firstMatch(patterns, text):
    """
    Return the match object of the first compiled pattern in 'patterns' that
    is found in 'text', or None when no pattern matches.
    """
    for pattern in patterns:
        hit = pattern.search(text)
        if hit:
            return hit
    return None


class XmlDumpReplacePageGenerator:
    """
    Iterable which yields Pages that might contain text to replace.

    The candidate pages are read from a local XML dump file (cur table).
    """
    def __init__(self, xmlFilename, replacements, exceptions):
        """
        Arguments:
            * xmlFilename  - The dump's path, either absolute or relative
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A list of compiled regular expressions; pages
                             which contain text that matches one of these
                             won't be yielded.
        """
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions

    def __iter__(self):
        """
        Parse the dump and yield one Page per entry whose text matches at
        least one replacement pattern and none of the exceptions.
        """
        import xmlreader
        mysite = wikipedia.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        for entry in dump.parse():
            if _firstMatch(self.exceptions, entry.text) is not None:
                continue
            # TODO: leave out pages that only have old inside nowiki,
            # comments, math
            originals = [old for old, new in self.replacements]
            if _firstMatch(originals, entry.text) is not None:
                yield wikipedia.Page(mysite, entry.title)


class ReplaceRobot:
    """
    A bot that can do text replacements.
    """
    def __init__(self, generator, replacements, exceptions = None, acceptall = False):
        """
        Arguments:
            * generator    - A generator that yields Page objects.
            * replacements - A list of 2-tuples of original text (as a
                             compiled regular expression) and replacement
                             text (as a string).
            * exceptions   - A list of compiled regular expressions; pages
                             which contain text that matches one of these
                             won't be changed. Defaults to no exceptions.
            * acceptall    - If True, the user won't be prompted before
                             changes are made.
        """
        self.generator = generator
        self.replacements = replacements
        # A fresh list per instance; a list literal as default value would
        # be shared by all robots created without this argument.
        if exceptions is None:
            exceptions = []
        self.exceptions = exceptions
        self.acceptall = acceptall

    def checkExceptions(self, original_text):
        """
        If one of the exceptions applies for the given text, returns the
        substring which matches the exception. Otherwise it returns None.
        """
        hit = _firstMatch(self.exceptions, original_text)
        if hit is None:
            return None
        return hit.group(0)

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text, in list order.
        """
        new_text = original_text
        for old, new in self.replacements:
            new_text = wikipedia.replaceExceptMathNowikiAndComments(new_text, old, new)
        return new_text

    def run(self):
        """
        Starts the robot.

        Every page of the generator is loaded, checked against the
        exceptions and run through the replacements. A diff is shown for
        changed pages and, unless acceptall is set, the user is asked
        whether to save it; answering 'All' sets acceptall for the rest of
        the run.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            try:
                # Load the page's text from the wiki
                original_text = page.get()
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                original_text = page.get(get_redirect=True)

            # skip all pages that contain certain texts
            match = self.checkExceptions(original_text)
            if match is not None:
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match))
                continue

            new_text = self.doReplacements(original_text)
            if new_text == original_text:
                wikipedia.output('No changes were necessary in %s' % page.title())
                continue

            wikipedia.output(u'\n>>> %s <<<' % page.title())
            wikipedia.showDiff(original_text, new_text)
            accepted = self.acceptall
            if not accepted:
                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                if choice in ['a', 'A']:
                    self.acceptall = True
                accepted = self.acceptall or choice in ['y', 'Y']
            if accepted:
                page.put(new_text)


def prepareRegexForMySQL(pattern):
    """
    Rewrite a Python regular expression source for use inside the quoted
    RLIKE operands built by main().

    The shorthands \\s, \\d and \\w are replaced by [:space:], [:digit:] and
    [:alnum:], and every single quote is prefixed with a backslash.
    """
    # NOTE(review): MySQL documents these POSIX classes for use within a
    # bracket expression ([[:space:]]); confirm that the bare form is what
    # is wanted for shorthands that are not already inside brackets.
    for shorthand, posix_class in (('\\s', '[:space:]'),
                                   ('\\d', '[:digit:]'),
                                   ('\\w', '[:alnum:]')):
        pattern = pattern.replace(shorthand, posix_class)
    return pattern.replace("'", "\\'")


def main():
    """
    Parse the command line, build the page generator and the list of
    replacements, then run a ReplaceRobot over the selected pages.

    Shows the module's help text and exits when no page source was given.
    """
    # Generator which yields the pages to work on.
    gen = None
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = []
    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Name of a predefined fix from the dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, used with -xml.
    xmlFilename = None
    useSql = False
    # the textfile's path, either absolute or relative, used with -file.
    textfilename = None
    # pages which will be processed when the -page parameter is used
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always commandline parameter.
    acceptall = False
    # Which namespaces should be processed?
    # default to [] which means all namespaces will be processed
    namespaces = []
    # Google query
    googleQuery = None

    # Load default summary message.
    wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg))

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-regex':
            regex = True
        elif arg.startswith('-file'):
            if len(arg) >= 6:
                textfilename = arg[6:]
            # Without a name the generator receives None.
            # NOTE(review): presumably it then asks for the filename itself
            # - verify against pagegenerators.
            gen = pagegenerators.TextfilePageGenerator(textfilename)
        elif arg.startswith('-cat'):
            if len(arg) == 4:
                categoryname = wikipedia.input(u'Please enter the category name:')
            else:
                categoryname = arg[5:]
            cat = catlib.Category(wikipedia.getSite(), 'Category:%s' % categoryname)
            gen = pagegenerators.CategorizedPageGenerator(cat)
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = wikipedia.input(u'Please enter the XML dump\'s filename:')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-ref'):
            if len(arg) == 4:
                referredPageTitle = wikipedia.input(u'Links to which page should be processed?')
            else:
                referredPageTitle = arg[5:]
            referredPage = wikipedia.Page(wikipedia.getSite(), referredPageTitle)
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links'):
            if len(arg) == 6:
                linkingPageTitle = wikipedia.input(u'Links from which page should be processed?')
            else:
                linkingPageTitle = arg[7:]
            linkingPage = wikipedia.Page(wikipedia.getSite(), linkingPageTitle)
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-start'):
            if len(arg) == 6:
                firstPageTitle = wikipedia.input(u'Which page do you want to change?')
            else:
                firstPageTitle = arg[7:]
            namespace = wikipedia.Page(wikipedia.getSite(), firstPageTitle).namespace()
            gen = pagegenerators.AllpagesPageGenerator(firstPageTitle, namespace)
        elif arg.startswith('-google'):
            if len(arg) >= 8:
                googleQuery = arg[8:]
            # Same convention as -file: None when no query was given.
            gen = pagegenerators.GoogleSearchPageGenerator(googleQuery)
        elif arg.startswith('-except:'):
            exceptions.append(arg[8:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg == '-always':
            acceptall = True
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        else:
            commandline_replacements.append(arg)

    if fix is None and len(commandline_replacements) == 2:
        old, new = commandline_replacements
        replacements.append((old, new))
        # The parentheses matter: '%' binds tighter than '+', so without
        # them only ' (-' would be inserted into the summary template.
        change = ' (-' + old + ' +' + new + ')'
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg) % change)
    elif fix is None:
        # Ask for the replacement pairs until an empty text is entered.
        old = wikipedia.input(u'Please enter the text that should be replaced:')
        new = wikipedia.input(u'Please enter the new text:')
        change = '(-' + old + ' +' + new
        replacements.append((old, new))
        while True:
            old = wikipedia.input(u'Please enter another text that should be replaced, or press Enter to start:')
            if old == '':
                change = change + ')'
                break
            new = wikipedia.input(u'Please enter the new text:')
            change = change + ' & -' + old + ' +' + new
            replacements.append((old, new))
        default_summary_message = wikipedia.translate(wikipedia.getSite(), msg) % change
        wikipedia.output(u'The summary message will default to: %s' % default_summary_message)
        summary_message = wikipedia.input(u'Press Enter to use this default message, or enter a description of the changes your bot will make:')
        if summary_message == '':
            summary_message = default_summary_message
        wikipedia.setAction(summary_message)
    else:
        # Perform one of the predefined actions.
        try:
            fix = fixes[fix]
        except KeyError:
            wikipedia.output(u'Available predefined fixes are: %s' % fixes.keys())
            wikipedia.stopme()
            sys.exit()
        if 'regex' in fix:
            regex = fix['regex']
        if 'msg' in fix:
            wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), fix['msg']))
        if 'exceptions' in fix:
            exceptions = fix['exceptions']
        replacements = fix['replacements']

    # already compile all regular expressions here to save time later.
    # New lists are built so that the module-level 'fixes' table keeps its
    # plain strings.
    compiled_replacements = []
    for old, new in replacements:
        if not regex:
            old = re.escape(old)
        compiled_replacements.append((re.compile(old, re.UNICODE), new))
    replacements = compiled_replacements

    compiled_exceptions = []
    for exception in exceptions:
        if not regex:
            exception = re.escape(exception)
        compiled_exceptions.append(re.compile(exception, re.UNICODE))
    exceptions = compiled_exceptions

    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, replacements, exceptions)
    elif useSql:
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        if exceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in exceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)

    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.output(__doc__, 'utf-8')
        wikipedia.stopme()
        sys.exit()
    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    bot = ReplaceRobot(preloadingGen, replacements, exceptions, acceptall)
    bot.run()


if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()