From 6c4c0be691a1971b54b47cc1f4891077f8ebf90e Mon Sep 17 00:00:00 2001 From: Brian Peterson Date: Mon, 1 Sep 2014 15:40:45 -0500 Subject: [PATCH 1/3] more changes to command; ebtter but needs effiency improvement --- .../management/commands/chrgdesc2category.py | 88 ++++++++++++++----- 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/convictions_data/management/commands/chrgdesc2category.py b/convictions_data/management/commands/chrgdesc2category.py index 1df872d..1b0d3a4 100644 --- a/convictions_data/management/commands/chrgdesc2category.py +++ b/convictions_data/management/commands/chrgdesc2category.py @@ -3,65 +3,111 @@ from django.core.management.base import BaseCommand from django.db import transaction -from convictions_data.models import Disposition +from convictions_data.models import Conviction from convictions_data.statute import (get_iucr, IUCRLookupError, ILCSLookupError, StatuteFormatError) -from pprint import pprint +from itertools import chain import json -# can't find a handler? -#logger = logging.getLogger(__name__) - -def append_or_create(dict, chrgdesc, category): +def update_or_create(dict, chrgdesc, category): if category: try: categories = dict[chrgdesc] if category not in categories: + # uncomment next line to see multiple IUCR categories acrue + #import pdb; pdb.set_trace() dict[chrgdesc].append(category) except KeyError: dict[chrgdesc] = [category] else: - # warn if there's no IUCR category for this disposition + # warn if there's no IUCR category for this conviction assert False +FIRST_TIME = True + +def true_once(): + global FIRST_TIME + if FIRST_TIME: + FIRST_TIME = not FIRST_TIME + return True + else: + return False + + class Command(BaseCommand): help = "Map charge descriptions to iucr categories." def handle(self, *args, **options): + print('inside the command') + chrgdesc_to_category = {} - for disposition in Disposition.objects.all(): + for conviction in Conviction.objects.all(): - chrgdesc = disposition.ammndchrgdescr if \ - disposition.ammndchrgdescr else disposition.chrgdesc - category = disposition.iucr_category + if true_once(): + print('inside the iteration') - case_number = disposition.case_number - statute = disposition.final_statute if \ - disposition.final_statute else disposition.statute - chrgdisp = disposition.chrgdisp - chrgdispdate = disposition.chrgdispdate + chrgdesc = conviction.final_chrgdesc + category = conviction.iucr_category + + case_number = conviction.case_number + statute = conviction.final_statute + chrgdispdate = conviction.chrgdispdate try: - append_or_create(chrgdesc_to_category, chrgdesc, category) + update_or_create(chrgdesc_to_category, chrgdesc, category) except AssertionError: - # print('No IUCR category for disposition: {} {} {} {}' - # .format(case_number, statute, chrgdispdate, chrgdisp)) - pass + + #print ('NO IUCR found for conviction: {}').format(conviction) + offense_tuples = set() + for conviction in Conviction.objects.filter(final_chrgdesc=chrgdesc): + try: + offense_tuples.add(tuple(get_iucr(conviction.final_statute))) + except IUCRLookupError: + #print ("UNKNOWN CODE associated /w chrgdesc: {}").format(chrgdesc) + pass + except ILCSLookupError: + #print ("UNKNOWN STATUTE associated /w chrgdesc: {}").format(chrgdesc) + pass + except StatuteFormatError: + #print ("UNKNOWN FORAMT associated /w chrgdesc: {}").format(chrgdesc) + pass + + if len(offense_tuples) == 1: + #print 'SAME IUCRs from every conviction' + offenses = list(offense_tuples)[0] + if len(set(o.offense_category for o in offenses)) == 1: + #print 'SAME CATEGORY for all the IUCRs' + category = offenses[0].offense_category + if category: + #print('SUCCESS: All convictions /w this chrgdesc ({}) have the ' + #'same iucr category ({})').format(chrgdesc, category) + update_or_create(chrgdesc_to_category, chrgdesc, category) + else: + #print 'something weird is going on' + pass + else: + #print "Couldn't get an IUCR category for this" + pass print('num total: ', len(chrgdesc_to_category)) + print 'writing file with all' with open('chrgdesc_to_category__all.json', 'w') as f: json.dump(chrgdesc_to_category, f) + print 'sorting' chrgdesc_to_category = {x: chrgdesc_to_category[x] for x in chrgdesc_to_category.keys() if len(chrgdesc_to_category[x]) > 1} + print('num with multiple: ', len(chrgdesc_to_category)) + print 'writing mutliples file' with open('chrgdesc_to_category__multiples.json', 'w') as f: - print('num with multiple: ', len(chrgdesc_to_category)) json.dump(chrgdesc_to_category, f) + print 'done' + From ddc8c3f5c8c961893a411988a93cd47c843bbd31 Mon Sep 17 00:00:00 2001 From: Brian Peterson Date: Tue, 2 Sep 2014 12:13:12 -0500 Subject: [PATCH 2/3] get chrgdesc2category command to reveal real multiple possible categories given a chrgdesc --- .../management/commands/chrgdesc2category.py | 136 ++++++++++-------- 1 file changed, 74 insertions(+), 62 deletions(-) diff --git a/convictions_data/management/commands/chrgdesc2category.py b/convictions_data/management/commands/chrgdesc2category.py index 1b0d3a4..2f3d7d4 100644 --- a/convictions_data/management/commands/chrgdesc2category.py +++ b/convictions_data/management/commands/chrgdesc2category.py @@ -1,30 +1,14 @@ -import logging +from __future__ import division from django.core.management.base import BaseCommand -from django.db import transaction from convictions_data.models import Conviction from convictions_data.statute import (get_iucr, IUCRLookupError, ILCSLookupError, StatuteFormatError) -from itertools import chain import json -def update_or_create(dict, chrgdesc, category): - if category: - try: - categories = dict[chrgdesc] - if category not in categories: - # uncomment next line to see multiple IUCR categories acrue - #import pdb; pdb.set_trace() - dict[chrgdesc].append(category) - except KeyError: - dict[chrgdesc] = [category] - else: - # warn if there's no IUCR category for this conviction - assert False - FIRST_TIME = True def true_once(): @@ -36,75 +20,103 @@ def true_once(): return False +def div(x, y): + try: + return round((x / y), 4) * 100 + except ZeroDivisionError: + return 0 + + +def update_or_create(dict, chrgdesc, new_categories): + try: + present_categories = dict[chrgdesc] + for c in new_categories: + if c not in present_categories: + dict[chrgdesc].append(c) + except KeyError: + dict[chrgdesc] = new_categories + + class Command(BaseCommand): - help = "Map charge descriptions to iucr categories." + help = \ + """ + Try to generate, as close as possible, + a one-to-one mapping between charge + descriptions and iucr categories. + """ def handle(self, *args, **options): print('inside the command') chrgdesc_to_category = {} + hit = 0 + + convictions = Conviction.objects.all() + total = convictions.count() - for conviction in Conviction.objects.all(): + for i, conviction in enumerate(convictions): if true_once(): print('inside the iteration') chrgdesc = conviction.final_chrgdesc category = conviction.iucr_category - - case_number = conviction.case_number statute = conviction.final_statute - chrgdispdate = conviction.chrgdispdate - - try: - update_or_create(chrgdesc_to_category, chrgdesc, category) - except AssertionError: - - #print ('NO IUCR found for conviction: {}').format(conviction) - offense_tuples = set() - for conviction in Conviction.objects.filter(final_chrgdesc=chrgdesc): - try: - offense_tuples.add(tuple(get_iucr(conviction.final_statute))) - except IUCRLookupError: - #print ("UNKNOWN CODE associated /w chrgdesc: {}").format(chrgdesc) - pass - except ILCSLookupError: - #print ("UNKNOWN STATUTE associated /w chrgdesc: {}").format(chrgdesc) - pass - except StatuteFormatError: - #print ("UNKNOWN FORAMT associated /w chrgdesc: {}").format(chrgdesc) - pass - - if len(offense_tuples) == 1: - #print 'SAME IUCRs from every conviction' - offenses = list(offense_tuples)[0] - if len(set(o.offense_category for o in offenses)) == 1: - #print 'SAME CATEGORY for all the IUCRs' + + # if exactly one IUCR code / category is associated + # with this conviction, map it to the conviction's + # charge description; + + # also make sure that the category can be found in the crosswalk's + # list of possible categories and is not just somehow in the database + if category and category not in [o.offense_category for o in get_iucr(statute)]: + category = '' + + if category: + update_or_create(chrgdesc_to_category, chrgdesc, [category]) + hit += 1 + + # otherwise, check if the conviction doesn't have an IUCR + # because multiple possible IUCRs matched the conviction's statute + else: + try: + offenses = get_iucr(statute) + except IUCRLookupError: + continue + except ILCSLookupError: + continue + except StatuteFormatError: + continue + + if len(offenses) >= 1: + # if so, check if all possible IUCRs associated with + # that statute map to a single charge description; + if len(set([o.offense_category for o in offenses])) == 1: category = offenses[0].offense_category - if category: - #print('SUCCESS: All convictions /w this chrgdesc ({}) have the ' - #'same iucr category ({})').format(chrgdesc, category) - update_or_create(chrgdesc_to_category, chrgdesc, category) - else: - #print 'something weird is going on' - pass + + update_or_create(chrgdesc_to_category, chrgdesc, [category]) + hit += 1 else: - #print "Couldn't get an IUCR category for this" - pass + categories = list(set([o.offense_category for o in offenses])) + update_or_create(chrgdesc_to_category, chrgdesc, categories) - print('num total: ', len(chrgdesc_to_category)) + print "{}% one-to-one mapping".format(div(hit, i)) + + print 'num total: ', len(chrgdesc_to_category) print 'writing file with all' with open('chrgdesc_to_category__all.json', 'w') as f: json.dump(chrgdesc_to_category, f) - print 'sorting' - chrgdesc_to_category = {x: chrgdesc_to_category[x] for x in chrgdesc_to_category.keys() if len(chrgdesc_to_category[x]) > 1} + chrgdesc_to_category_multiples = {x: chrgdesc_to_category[x] for x in chrgdesc_to_category.keys() if len(chrgdesc_to_category[x]) > 1} - print('num with multiple: ', len(chrgdesc_to_category)) - print 'writing mutliples file' + print 'num chrgdesc that map to multiple possible IUCR categories: ', len(chrgdesc_to_category_multiples) + print 'writing multiples file' with open('chrgdesc_to_category__multiples.json', 'w') as f: - json.dump(chrgdesc_to_category, f) + json.dump(chrgdesc_to_category_multiples, f) + + print ('num convictions whose chrgdesc maps to multiple possible ' + 'IUCR categories: {}').format(total - hit) print 'done' From f2cfbda85369f7844f217e9ebe3d111c34748fb7 Mon Sep 17 00:00:00 2001 From: Brian Peterson Date: Fri, 12 Sep 2014 03:38:38 -0500 Subject: [PATCH 3/3] add disambiguate command for looking over individual chrgdesc/convction/statute/iucr --- .../management/commands/disambiguate.py | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 convictions_data/management/commands/disambiguate.py diff --git a/convictions_data/management/commands/disambiguate.py b/convictions_data/management/commands/disambiguate.py new file mode 100644 index 0000000..742907d --- /dev/null +++ b/convictions_data/management/commands/disambiguate.py @@ -0,0 +1,120 @@ +from __future__ import division + +from django.core.management.base import BaseCommand +from django.db.models import Count + +from convictions_data.models import Conviction +from convictions_data.statute import get_iucr, IUCRLookupError, \ + ILCSLookupError, StatuteFormatError + +import json, sys + + +def prefix_fmt(num_break): + return ' ' * num_break + +def suffix_fmt(num_break): + return '\n' * num_break + +def fmt(msg, prefix, suffix): + prefix = prefix_fmt(prefix) + suffix = suffix_fmt(suffix) + print "{0}{1} ...{2}".format(prefix, msg, suffix) + +def fmt_item(name, item, prefix, suffix): + if not item: + item = 'This value is empty.' + + msg='{}: {}'.format(name, item) + fmt(msg, prefix, suffix) + +def set_formatting(loop_level): + # no indent, two newlines + # at first loop level + if loop_level == 0: + return 0, 2 + + # one indent, two new lines + # at second loop level + if loop_level == 1: + return 1, 2 + + # two indents, one new line + # at third loop level + if loop_level == 2: + return 2, 1 + + +class Command(BaseCommand): + help = \ + """ + Do some work... + """ + + def handle(self, *args, **options): + + with open('chrgdesc_to_category__multiples.json') as f: + + START_LOOP = 0 + prefix = None + suffix = None + + multiples = json.load(f) + + print '\n' + print 'Total num of multiples: {}'.format(len(multiples)) + print '\n\n' + + for chrgdesc in multiples: + + loop_level = START_LOOP + prefix, suffix = set_formatting(loop_level) + + fmt_item('chrgdesc', chrgdesc, prefix, 0) + + convictions = Conviction.objects.filter(final_chrgdesc=chrgdesc).values('final_statute').annotate(Count('id')).order_by() + num_convictons = convictions.count() + fmt('Num of statutes: {}'.format(num_convictons), prefix, suffix) + + for i, c in enumerate(convictions): + + statute = c['final_statute'] + + loop_level = 1 + prefix, suffix = set_formatting(loop_level) + + fmt_item('statute', statute, prefix, 0) + fmt('Num statutes left: {}'.format(num_convictons - (i+1)), prefix, suffix) + try: + + loop_level = 2 + prefix, suffix = set_formatting(loop_level) + + o_tuples = [(o.code, o.offense_category) for o in get_iucr(statute)] + fmt_item('codes', [o[0] for o in o_tuples], prefix, suffix) + fmt_item('categories', [o[1] for o in o_tuples], prefix, suffix) + + except IUCRLookupError: + fmt('IUCRLookupError occurred', prefix, suffix) + except ILCSLookupError: + fmt('ILCSLookupError occurred', prefix, suffix) + except StatuteFormatError: + fmt('StatuteFormatError occurred', prefix, suffix) + + finally: + + try: + cmd = raw_input('>> ') + except KeyboardInterrupt: + print '\ndone!' + sys.exit(0) + + if cmd == 'n': + break + print '\n' + + print 'done!' + + + +