# User:JarektUploadBot/FixWGAMetadataArt.py

#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
A program to fix the metadata and categories of Wikimedia Commons files that came from the Web Gallery of Art website at http://www.wga.hu/
"""

import sys, os.path, glob, re, hashlib, base64, StringIO, time sys.path.append("C:/Programs/pywikipedia/") sys.path.append("../") import wikipedia, upload, csv, urllib2, string, catlib

# Rule printed above and below every section title in the bot's log output.
_LOG_RULE = "================================================================================"

# CSV columns that are decoded to unicode text, in the order they are read.
_TEXT_FIELDS = (
    'CREATOR', 'DATE', 'TITLE', 'DIMENSIONS', 'TECHNIQUE', 'FILENAME',
    'FILENAME2', 'FORM', 'TYPE', 'SCHOOL', 'TIMELINE', 'INSTITUTION',
    'CREATOR_CAT', 'INSTITUTION_CAT', 'TITLE_CAT', 'DATE_CAT', 'URL',
    'IMAGEURL', 'FRAME',
)

# Text that replaces the opening "{{Artwork" of the existing description.
# It is deliberately left unclosed: the parameters and closing braces that
# followed "{{Artwork" on the page stay in place after it.
_ARTICLE_TEMPLATE = (
    u"{{subst:User:Jarekt/WGA     \n"
    u" |CREATOR          = %(CREATOR)s\n"
    u" |TITLE            = %(TITLE)s\n"
    u" |DATE             = %(DATE)s\n"
    u" |TECHNIQUE        = %(TECHNIQUE)s\n"
    u" |DIMENSIONS       = %(DIMENSIONS)s\n"
    u" |INSTITUTION      = %(INSTITUTION)s\n"
    u" |FRAME            = %(FRAME)s\n"
    u" |FORM             = %(FORM)s\n"
    u" |TYPE             = %(TYPE)s\n"
    u" |SCHOOL           = %(SCHOOL)s\n"
    u" |TIMELINE         = %(TIMELINE)s\n"
    u" |SOURCE           = %(SOURCE)s\n"
    u"\n"
)

# Whitespace and empty-link clean-ups applied, in this order, to the category
# wikitext before it is appended to the description.
_CATEGORY_CLEANUPS = (
    (" |", "|"),
    ("| ", "|"),
    ("Paintings by  ", "Paintings by "),
    ("[[Category:]]\n", ""),
    ("[[Category:]]", ""),
    ("[[Category: ", "[[Category:"),
    ("[[Category::", "[[Category:"),
)


def _logSection(title):
  """Write a section title to the bot log, framed by two rule lines."""
  wikipedia.output(_LOG_RULE)
  wikipedia.output(title)
  wikipedia.output(_LOG_RULE)


def _readMetadata(row):
  """Decode one CSV row into the dictionary used to format the page."""
  enc = 'utf-8'
  metadata = {'IMG_ID': int(row.get(u'IMG_ID'))}
  for field in _TEXT_FIELDS:
    metadata[field] = unicode(row.get(field), enc)
  metadata['FORM1'] = metadata['FORM'].capitalize()
  metadata['FILENAME2'] = 'File:' + metadata['FILENAME2'].strip()
  metadata['CREATOR'] = metadata['CREATOR'].strip()
  metadata['INSTITUTION'] = metadata['INSTITUTION'].strip()
  return metadata


def _sourceText(desc, metadata):
  """Return the SOURCE value, keeping an older source credit found in desc.

  The original comment explains why: files first taken from
  www.allartpainting.com or The Yorck Project sometimes had the same
  binaries as WGA, so that earlier source is listed ahead of the WGA links.
  """
  # NOTE(review): in the copy of this script under review the three literals
  # below were broken across blank lines (the first one had no closing
  # quote at all), which looks like markup such as "<br/>" being swallowed
  # when the page was rendered.  The visible characters are reproduced
  # exactly; confirm against the author's original before running.
  source = (u"Web Gallery of Art: [%(IMAGEURL)s Image] "
            u"[%(URL)s Info about artwork]\n\n") % metadata

  m = re.search(r"http:\/\/www\.allartpainting\.com\/[^\n\]\s]*", desc)
  if m is not None:
    return u"\n\n[%s www.allartpainting.com]\n%s" % (m.group(0), source)

  if re.search("The Yorck Project", desc) is not None:
    return (u"\n\nThe Yorck Project: 10.000 Meisterwerke der Malerei. "
            u"DVD-ROM, 2002. ISBN 3936122202. Distributed by DIRECTMEDIA "
            u"Publishing GmbH.\n%s") % source

  return source


def _parentCategoryText(site, desc):
  """List the page's categories and their parent categories, one per line."""
  titles = []
  for m in re.finditer(r"\[\[[Cc]ategory:([^\]\|]*)", desc):
    title = u'Category:%s' % m.group(1)
    titles.append(title)
    # The title is passed without the line terminator that is only needed
    # for the one-per-line listing returned below.
    for parent in catlib.Category(site, title).supercategoriesList():
      titles.append(parent.title())
  return u''.join([title + u'\n' for title in titles])


def processFile(row):
  """Rewrite the Commons description page of one image from a CSV row.

  The current wikitext of the page named by the FILENAME2 column is
  fetched, its opening "{{Artwork" is replaced by a substituted
  User:Jarekt/WGA template call filled from the row, the category text is
  appended, and the page is saved.  The function then sleeps for 30
  seconds.  The text before and after the change and the list of parent
  categories are written to the bot log.

  When the existing page already uses an Institution template, a
  "Creator:" reference or a Technique template, the matching CSV field is
  blanked before formatting.

  Args:
    row: mapping from CSV column name to a UTF-8 encoded byte string
      (main() passes rows produced by csv.DictReader).

  Raises:
    TypeError: presumably when a column is missing, since row.get() then
      yields None, which int() and unicode() reject -- verify against the
      CSV layout.
    Any error raised by the wikipedia/catlib calls is not caught here.

  NOTE(review): the copy of this script under review did not parse.  Only
  the syntax has been repaired; every spot where wikitext appears to have
  been lost is marked below and must be restored from the author's
  original before this is run against the live site.
  """
  # Read line of metadata
  metadata = _readMetadata(row)

  # Get current file description
  targetSite = wikipedia.getSite('commons', 'commons')
  page = wikipedia.Page(targetSite, metadata['FILENAME2'])
  desc = page.get()
  _logSection("=== BEFORE %(FILENAME2)s" % metadata)
  wikipedia.output(desc)

  # Merge sources if the original came from another site
  metadata['SOURCE'] = _sourceText(desc, metadata)

  # If the original description used an institution template, keep that one
  desc = desc.replace("{{:museum:", "{{Institution:")
  if re.search(r"\{\{[Ii]nstitution:", desc) is not None:
    metadata['INSTITUTION'] = ""

  # If the original description used a creator template, keep that one
  if re.search(r"[Cc]reator:", desc) is not None:
    metadata['CREATOR'] = ""

  # If the original description used a technique template, keep that one
  if re.search(r"\{\{[Tt]echnique\|", desc) is not None:
    metadata['TECHNIQUE'] = ""

  # Format file description
  description = desc.replace("{{Artwork", _ARTICLE_TEMPLATE % metadata)

  # Get the file's categories and the parent categories of those
  parentCats = _parentCategoryText(targetSite, desc)
  _logSection("=== parent =====================================================================")
  wikipedia.output(parentCats)

  # NOTE(review): every literal added to `cats` below held nothing but line
  # breaks in the reviewed copy, although several were formatted with
  # "% metadata" and the clean-ups that follow look for "[[Category:...]]"
  # links.  The category wikitext was presumably stripped when the page was
  # rendered; as written, only blank lines are appended.
  cats = u"\n\n"

  # A category is only added when its name does not already occur in the
  # file's own or parent categories (plain substring test).
  for field in ('TITLE_CAT', 'DATE_CAT'):
    if metadata[field] not in parentCats:
      cats += '\n'

  # For the creator only the last space-separated word is looked up.
  creatorWord = metadata['CREATOR_CAT'].rpartition(' ')[2]
  if creatorWord not in parentCats:
    cats += u'\n'

  if metadata['INSTITUTION_CAT'] not in parentCats:
    cats += u'\n'

  for old, new in _CATEGORY_CLEANUPS:
    cats = cats.replace(old, new)
  description = description + cats
  # NOTE(review): the reviewed copy also replaced "" with "" at this point,
  # which changes nothing; its search text was presumably lost as well.
  description = description.replace("{{}}", "")

  _logSection("=== AFTER ======================================================================")
  wikipedia.output(description)
  # The two trailing flags are passed positionally, as in the original;
  # presumably "watch page" and "minor edit" -- confirm against Page.put.
  page.put(description, "Update metadata and categories. Please check!", True, False)
  time.sleep(30)

def main(args):
  """Run processFile() on every row of WGA_artbatch2.csv.

  The CSV is read with csv.DictReader, so its first line supplies the
  column names.  A row that the csv module cannot parse is logged as
  'skip' and the run continues with the next row.

  Args:
    args: command-line arguments; accepted for the caller's sake but not
      used.
  """
  csvFile = 'WGA_artbatch2.csv'

  csvHandle = open(csvFile, "rb")
  try:
    rows = iter(csv.DictReader(csvHandle, dialect='excel', delimiter=','))
    while True:
      # csv.Error is raised while a row is being fetched, so the fetch has
      # to sit inside the handler for a bad row to be skipped at all.
      try:
        row = next(rows)
      except StopIteration:
        break
      except csv.Error:
        wikipedia.output('skip')
        continue

      # Kept from the original: a csv.Error escaping processFile() is
      # treated as a skipped row as well.
      try:
        processFile(row)
      except csv.Error:
        wikipedia.output('skip')
  finally:
    # Close the CSV even when processFile() fails part-way through.
    csvHandle.close()

if __name__ == "__main__":
  # Script entry point: hand the command-line arguments (without the script
  # name) to main().  The closing message is printed whether main() returned
  # normally or raised.
  cliArgs = sys.argv[1:]
  try:
      main(cliArgs)
  finally:
      print("All done!")