User:JarektUploadBot/FixWGAMetadataArt.py
Appearance
<syntaxhighlight copy line lang="python">
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
A program to upload all the images in the Web Gallery of Art website at http://www.wga.hu/
'''
import base64
import csv
import glob
import hashlib
import os.path
import re
import string
import StringIO
import sys
import time
import urllib2

# Make the pywikipedia checkout importable before importing its modules.
sys.path.append("C:/Programs/pywikipedia/")
sys.path.append("../")

import wikipedia
import upload
import catlib
def processFile(row):
    """Rewrite the Commons description page of one image from a CSV row.

    `row` is a mapping of column name -> UTF-8 byte string (main() passes
    rows produced by csv.DictReader).  The function:

    1. decodes the row into a `metadata` dict;
    2. loads the current wikitext of ``File:<FILENAME2>`` from Commons;
    3. builds the SOURCE text, merging in an allartpainting.com URL or a
       Yorck Project credit if the existing description mentions one;
    4. blanks INSTITUTION / CREATOR / TECHNIQUE when the existing
       description already uses the corresponding template;
    5. replaces the ``{{Artwork`` opener with a filled-in
       ``{{subst:User:Jarekt/WGA`` call and appends a category block;
    6. saves the page and sleeps 30 seconds.

    Nothing is caught here: a missing column makes int()/unicode() raise
    TypeError, and errors from page.get()/page.put() propagate.

    NOTE(review): this code was recovered from a rendered wiki page.  All
    indentation was lost (nesting below is inferred) and wiki markup inside
    several string literals was swallowed.  Literals that are still valid
    Python are kept exactly as found and flagged individually; confirm them
    against the bot's real source before running against live pages.
    """
    # Read line of metadata
    enc = 'utf-8'
    text_fields = (
        'CREATOR', 'DATE', 'TITLE', 'DIMENSIONS', 'TECHNIQUE', 'FILENAME',
        'FILENAME2', 'FORM', 'TYPE', 'SCHOOL', 'TIMELINE', 'INSTITUTION',
        'CREATOR_CAT', 'INSTITUTION_CAT', 'TITLE_CAT', 'DATE_CAT', 'URL',
        'IMAGEURL', 'FRAME',
    )
    metadata = {'IMG_ID': int(row.get(u'IMG_ID'))}
    for field in text_fields:
        metadata[field] = unicode(row.get(field), enc)
    metadata['FORM1'] = metadata['FORM'].capitalize()
    metadata['FILENAME2'] = 'File:' + metadata['FILENAME2'].strip()
    metadata['CREATOR'] = metadata['CREATOR'].strip()
    metadata['INSTITUTION'] = metadata['INSTITUTION'].strip()

    # Get current file description
    targetSite = wikipedia.getSite('commons', 'commons')
    page = wikipedia.Page(targetSite, metadata['FILENAME2'])
    desc = page.get()
    rule = "================================================================================"
    wikipedia.output(rule)
    wikipedia.output("=== BEFORE %(FILENAME2)s" % metadata)
    wikipedia.output(rule)
    wikipedia.output(desc)

    # Merge sources if original was from ww.allartpainting.com or The Yorck
    # Project, both of those sources sometimes had the same binaries as WGA.
    # NOTE(review): the recovered text had this as an unterminated one-line
    # u"..." literal spread over four lines; the line breaks were presumably
    # <br> markup eaten by the wiki renderer.  The visible text is kept as-is
    # in a triple-quoted literal so the code parses -- confirm the original.
    metadata['SOURCE'] = u"""Web Gallery of Art:
[%(IMAGEURL)s Image]
[%(URL)s Info about artwork]
""" % metadata
    # NOTE(review): the leading "- " in the two literals below was probably a
    # wiki list marker ("*" or "#") converted during extraction -- confirm.
    # NOTE(review): nesting of the Yorck check under the original `else:` is
    # inferred; indentation was lost.
    m = re.search(r"http:\/\/www\.allartpainting\.com\/[^\n\]\s]*", desc)
    if m is not None:
        metadata['SOURCE'] = u"""
- [%s www.allartpainting.com]
- %s""" % (m.group(0), metadata['SOURCE'])
    elif re.search("The Yorck Project", desc) is not None:
        metadata['SOURCE'] = u"""
- The Yorck Project: 10.000 Meisterwerke der Malerei. DVD-ROM, 2002. ISBN 3936122202. Distributed by DIRECTMEDIA Publishing GmbH.
- %s""" % (metadata['SOURCE'])

    # If the original description used an institution template then keep
    # that one (blank our value so it is not emitted).
    desc = desc.replace("{{:museum:", "{{Institution:")
    if re.search(r"\{\{[Ii]nstitution:", desc) is not None:
        metadata['INSTITUTION'] = ""
    # Same for a creator template ...
    if re.search("[Cc]reator:", desc) is not None:
        metadata['CREATOR'] = ""
    # ... and for a technique template.
    if re.search(r"\{\{[Tt]echnique\|", desc) is not None:
        metadata['TECHNIQUE'] = ""

    # Format file description.  Only the "{{Artwork" opener is replaced, so
    # whatever followed it in the existing description stays in place.
    article_template = u"""{{subst:User:Jarekt/WGA
|CREATOR = %(CREATOR)s
|TITLE = %(TITLE)s
|DATE = %(DATE)s
|TECHNIQUE = %(TECHNIQUE)s
|DIMENSIONS = %(DIMENSIONS)s
|INSTITUTION = %(INSTITUTION)s
|FRAME = %(FRAME)s
|FORM = %(FORM)s
|TYPE = %(TYPE)s
|SCHOOL = %(SCHOOL)s
|TIMELINE = %(TIMELINE)s
|SOURCE = %(SOURCE)s
"""
    description = desc.replace("{{Artwork", article_template % metadata)

    # Get the file's categories and the parent categories of those, as one
    # newline-separated text blob used for substring checks below.
    # (The recovered text read `parentCats=` with nothing after it; an empty
    # string is the only value consistent with the `+=` uses that followed.)
    parentLines = []
    for m in re.finditer(r"\[\[[Cc]ategory:([^\]\|]*)", desc):
        catTitle = u'Category:%s' % m.group(1)
        parentLines.append(catTitle + u'\n')
        # The title is passed without the trailing newline the old code
        # included in it.
        catO = catlib.Category(targetSite, catTitle)
        for parent in catO.supercategoriesList():
            parentLines.append(parent.title() + '\n')
    parentCats = u''.join(parentLines)
    wikipedia.output(rule)
    wikipedia.output("=== parent =====================================================================")
    wikipedia.output(rule)
    wikipedia.output(parentCats)

    # Get file's categories in wikitext format with sort order (if any).
    # NOTE(review): as recovered this literal holds only a newline; any
    # [[Category:...]] wikitext it contained was presumably swallowed by the
    # wiki renderer -- confirm.
    cats = u"""
""" % metadata

    # Try adding categories that are not already covered by the existing
    # categories or their parents.
    # NOTE(review): each `cats +=` below appends only "\n" as recovered; the
    # cleanup of "[[Category:]]" further down suggests the originals appended
    # [[Category:...]] links that were lost in extraction -- confirm.
    cat = u'%(TITLE_CAT)s' % metadata
    if cat not in parentCats:
        cats += '\n'
    cat = u'%(DATE_CAT)s' % metadata
    if cat not in parentCats:
        cats += '\n'
    cat = u'%(CREATOR_CAT)s' % metadata
    # Only the text after the last space of the creator category is checked.
    p = cat.rpartition(' ')
    if p[2] not in parentCats:
        cats += u'\n' % metadata
    cat = u'%(INSTITUTION_CAT)s' % metadata
    if cat not in parentCats:
        cats += u'\n' % metadata

    # Tidy the generated category wikitext.
    cats = cats.replace(" |", "|")
    cats = cats.replace("| ", "|")
    # NOTE(review): no-op as recovered; the search string probably contained
    # a double space that was collapsed in extraction -- confirm.
    cats = cats.replace("Paintings by ", "Paintings by ")
    cats = cats.replace("[[Category:]]\n", "")
    cats = cats.replace("[[Category:]]", "")
    cats = cats.replace("[[Category: ", "[[Category:")
    cats = cats.replace("[[Category::", "[[Category:")

    description = description + cats
    # NOTE(review): empty search string as recovered (a no-op); the original
    # pattern was presumably markup removed by the renderer -- confirm.
    description = description.replace("", "")
    description = description.replace("{{}}", "")
    #description = string.replace(description, "\n\n", "\n")
    wikipedia.output(rule)
    wikipedia.output("=== AFTER ======================================================================")
    wikipedia.output(rule)
    wikipedia.output(description)
    # NOTE(review): the two positional flags are presumably watchArticle=True,
    # minorEdit=False -- confirm against the pywikipedia Page.put signature.
    page.put(description, "Update metadata and categories. Please check!", True, False)
    # Pause between edits.
    time.sleep(30)
def main(args):
    """Run processFile() on every row of the hard-coded CSV batch file.

    `args` is accepted from the command-line entry point but is not used.
    The CSV is read with csv.DictReader, so each row handed to processFile()
    is a dict keyed by the file's header line.
    """
    csvFile = 'WGA_artbatch2.csv'
    # Binary mode is what the Python 2 csv module asks for; the `with` block
    # closes the handle even if a row fails.
    with open(csvFile, "rb") as handle:
        reader = csv.DictReader(handle, dialect='excel', delimiter=',')
        for row in reader:
            try:
                processFile(row)
            except csv.Error:
                # NOTE(review): csv.Error is normally raised while the reader
                # iterates, not inside processFile(), so this handler may
                # never fire; any other exception still aborts the run.
                wikipedia.output('skip')
if __name__ == "__main__":
    try:
        main(sys.argv[1:])
    finally:
        # Runs whether or not main() raised.  The call form prints the same
        # text under Python 2 and also parses under Python 3.
        print("All done!")