diff --git a/convictions_data/management/commands/chrgdesc2category.py b/convictions_data/management/commands/chrgdesc2category.py index 1df872d..2f3d7d4 100644 --- a/convictions_data/management/commands/chrgdesc2category.py +++ b/convictions_data/management/commands/chrgdesc2category.py @@ -1,66 +1,124 @@ -import logging +from __future__ import division from django.core.management.base import BaseCommand -from django.db import transaction -from convictions_data.models import Disposition +from convictions_data.models import Conviction from convictions_data.statute import (get_iucr, IUCRLookupError, ILCSLookupError, StatuteFormatError) -from pprint import pprint import json -# can't find a handler? -#logger = logging.getLogger(__name__) +FIRST_TIME = True -def append_or_create(dict, chrgdesc, category): - if category: - try: - categories = dict[chrgdesc] - if category not in categories: - dict[chrgdesc].append(category) - except KeyError: - dict[chrgdesc] = [category] +def true_once(): + global FIRST_TIME + if FIRST_TIME: + FIRST_TIME = not FIRST_TIME + return True else: - # warn if there's no IUCR category for this disposition - assert False + return False -class Command(BaseCommand): - help = "Map charge descriptions to iucr categories." - def handle(self, *args, **options): +def div(x, y): + try: + return round((x / y), 4) * 100 + except ZeroDivisionError: + return 0 - chrgdesc_to_category = {} - for disposition in Disposition.objects.all(): +def update_or_create(dict, chrgdesc, new_categories): + try: + present_categories = dict[chrgdesc] + for c in new_categories: + if c not in present_categories: + dict[chrgdesc].append(c) + except KeyError: + dict[chrgdesc] = new_categories - chrgdesc = disposition.ammndchrgdescr if \ - disposition.ammndchrgdescr else disposition.chrgdesc - category = disposition.iucr_category - case_number = disposition.case_number - statute = disposition.final_statute if \ - disposition.final_statute else disposition.statute - chrgdisp = disposition.chrgdisp - chrgdispdate = disposition.chrgdispdate +class Command(BaseCommand): + help = \ + """ + Try to generate, as close as possible, + a one-to-one mapping between charge + descriptions and iucr categories. + """ + + def handle(self, *args, **options): - try: - append_or_create(chrgdesc_to_category, chrgdesc, category) - except AssertionError: - # print('No IUCR category for disposition: {} {} {} {}' - # .format(case_number, statute, chrgdispdate, chrgdisp)) - pass + print('inside the command') - print('num total: ', len(chrgdesc_to_category)) + chrgdesc_to_category = {} + hit = 0 + + convictions = Conviction.objects.all() + total = convictions.count() + + for i, conviction in enumerate(convictions): + + if true_once(): + print('inside the iteration') + + chrgdesc = conviction.final_chrgdesc + category = conviction.iucr_category + statute = conviction.final_statute + + # if exactly one IUCR code / category is associated + # with this conviction, map it to the conviction's + # charge description; + + # also make sure that the category can be found in the crosswalk's + # list of possible categories and is not just somehow in the database + if category and category not in [o.offense_category for o in get_iucr(statute)]: + category = '' + + if category: + update_or_create(chrgdesc_to_category, chrgdesc, [category]) + hit += 1 + + # otherwise, check if the conviction doesn't have an IUCR + # because multiple possible IUCRs matched the conviction's statute + else: + try: + offenses = get_iucr(statute) + except IUCRLookupError: + continue + except ILCSLookupError: + continue + except StatuteFormatError: + continue + + if len(offenses) >= 1: + # if so, check if all possible IUCRs associated with + # that statute map to a single charge description; + if len(set([o.offense_category for o in offenses])) == 1: + category = offenses[0].offense_category + + update_or_create(chrgdesc_to_category, chrgdesc, [category]) + hit += 1 + else: + categories = list(set([o.offense_category for o in offenses])) + update_or_create(chrgdesc_to_category, chrgdesc, categories) + + print "{}% one-to-one mapping".format(div(hit, i)) + + print 'num total: ', len(chrgdesc_to_category) + print 'writing file with all' with open('chrgdesc_to_category__all.json', 'w') as f: json.dump(chrgdesc_to_category, f) - chrgdesc_to_category = {x: chrgdesc_to_category[x] for x in chrgdesc_to_category.keys() if len(chrgdesc_to_category[x]) > 1} + chrgdesc_to_category_multiples = {x: chrgdesc_to_category[x] for x in chrgdesc_to_category.keys() if len(chrgdesc_to_category[x]) > 1} + print 'num chrgdesc that map to multiple possible IUCR categories: ', len(chrgdesc_to_category_multiples) + print 'writing multiples file' with open('chrgdesc_to_category__multiples.json', 'w') as f: - print('num with multiple: ', len(chrgdesc_to_category)) - json.dump(chrgdesc_to_category, f) + json.dump(chrgdesc_to_category_multiples, f) + + print ('num convictions whose chrgdesc maps to multiple possible ' + 'IUCR categories: {}').format(total - hit) + + print 'done' diff --git a/convictions_data/management/commands/disambiguate.py b/convictions_data/management/commands/disambiguate.py new file mode 100644 index 0000000..742907d --- /dev/null +++ b/convictions_data/management/commands/disambiguate.py @@ -0,0 +1,120 @@ +from __future__ import division + +from django.core.management.base import BaseCommand +from django.db.models import Count + +from convictions_data.models import Conviction +from convictions_data.statute import get_iucr, IUCRLookupError, \ + ILCSLookupError, StatuteFormatError + +import json, sys + + +def prefix_fmt(num_break): + return ' ' * num_break + +def suffix_fmt(num_break): + return '\n' * num_break + +def fmt(msg, prefix, suffix): + prefix = prefix_fmt(prefix) + suffix = suffix_fmt(suffix) + print "{0}{1} ...{2}".format(prefix, msg, suffix) + +def fmt_item(name, item, prefix, suffix): + if not item: + item = 'This value is empty.' + + msg='{}: {}'.format(name, item) + fmt(msg, prefix, suffix) + +def set_formatting(loop_level): + # no indent, two newlines + # at first loop level + if loop_level == 0: + return 0, 2 + + # one indent, two new lines + # at second loop level + if loop_level == 1: + return 1, 2 + + # two indents, one new line + # at third loop level + if loop_level == 2: + return 2, 1 + + +class Command(BaseCommand): + help = \ + """ + Do some work... + """ + + def handle(self, *args, **options): + + with open('chrgdesc_to_category__multiples.json') as f: + + START_LOOP = 0 + prefix = None + suffix = None + + multiples = json.load(f) + + print '\n' + print 'Total num of multiples: {}'.format(len(multiples)) + print '\n\n' + + for chrgdesc in multiples: + + loop_level = START_LOOP + prefix, suffix = set_formatting(loop_level) + + fmt_item('chrgdesc', chrgdesc, prefix, 0) + + convictions = Conviction.objects.filter(final_chrgdesc=chrgdesc).values('final_statute').annotate(Count('id')).order_by() + num_convictons = convictions.count() + fmt('Num of statutes: {}'.format(num_convictons), prefix, suffix) + + for i, c in enumerate(convictions): + + statute = c['final_statute'] + + loop_level = 1 + prefix, suffix = set_formatting(loop_level) + + fmt_item('statute', statute, prefix, 0) + fmt('Num statutes left: {}'.format(num_convictons - (i+1)), prefix, suffix) + try: + + loop_level = 2 + prefix, suffix = set_formatting(loop_level) + + o_tuples = [(o.code, o.offense_category) for o in get_iucr(statute)] + fmt_item('codes', [o[0] for o in o_tuples], prefix, suffix) + fmt_item('categories', [o[1] for o in o_tuples], prefix, suffix) + + except IUCRLookupError: + fmt('IUCRLookupError occurred', prefix, suffix) + except ILCSLookupError: + fmt('ILCSLookupError occurred', prefix, suffix) + except StatuteFormatError: + fmt('StatuteFormatError occurred', prefix, suffix) + + finally: + + try: + cmd = raw_input('>> ') + except KeyboardInterrupt: + print '\ndone!' + sys.exit(0) + + if cmd == 'n': + break + print '\n' + + print 'done!' + + + +