# python/LexPredict/lexpredict-contraxsuite/contraxsuite_services/apps/task/forms.py

# forms.py
"""
    Copyright (C) 2017, ContraxSuite, LLC

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    You can also be released from the requirements of the license by purchasing
    a commercial license from ContraxSuite, LLC. Buying such a license is
    mandatory as soon as you develop commercial activities involving ContraxSuite
    software without disclosing the source code of your own applications.  These
    activities include: offering paid services to customers as an ASP or "cloud"
    provider, processing documents on the fly in a web application,
    or shipping ContraxSuite within a closed source product.
"""
# -*- coding: utf-8 -*-

# Standard imports
import json

# Django imports
from django import forms
from django.conf import settings

# Project imports
from apps.astyze.models import TextUnitClastification, TextUnitClastifier
from apps.common.forms import checkbox_field
from apps.common.utils import fast_uuid
from apps.common.widgets import LTRCheckgroupWidget
from apps.dokiment.models import DokimentProperty, TextUnitProperty, DokimentType
from apps.project.models import Project
from apps.task.models import Task

# Module-level package metadata (authorship, license link, release version).
# NOTE(review): the __email__ value looks like an address-obfuscation artifact
# rather than a real address — confirm and restore the intended contact.
__author__ = "ContraxSuite, LLC; LexPredict, LLC"
__copyright__ = "Copyright 2015-2019, ContraxSuite, LLC"
__license__ = "https://github.com/LexPredict/lexpredict-contraxsuite/blob/1.4.0/LICENSE"
__version__ = "1.4.0"
__maintainer__ = "LexPredict, LLC"
__email__ = "[email protected]"


# Help-text template for "path to a file" inputs.  Only the media sub-directory
# (``%s``) is substituted here; the ``{}`` placeholder stays in the resulting
# string — presumably for a later ``str.format`` call, nothing in this module
# fills it (TODO confirm against callers).
# The setting name must be all-uppercase: Django only loads uppercase names
# from the settings module.
path_help_text_sample = '''
Relative path to a file with {}. A file should be in "<ROOT_DIR>/data/"
 or "<APPS_DIR>/media/%s" folder.''' % settings.FILEBROWSER_DOCUMENTS_DIRECTORY


class LoadDokimentsForm(forms.Form):
    """
    Options form for the task described by ``header``.

    Only ``source_data`` is mandatory; the project, source type and type
    selectors are optional, and the three checkboxes toggle contract
    detection, deletion of existing items and running standard locators.
    """
    header = 'Parse dokiments to create Dokiments and Text Units.'
    project = forms.ModelChoiceField(queryset=Project.objects.all(), required=False)
    # The help text embeds the configured media sub-directory; the setting name
    # must be all-uppercase, otherwise Django never loads it.
    source_data = forms.CharField(
        max_length=1000,
        required=True,
        help_text='''
        Path to a folder with uploaded files relative to "/media/%s". For example, "new" or "/".
        Create new folders and upload new dokiments if needed.
        ''' % settings.FILEBROWSER_DOCUMENTS_DIRECTORY)
    source_type = forms.CharField(
        max_length=100,
        required=False)
    dokiment_type = forms.ModelChoiceField(queryset=DokimentType.objects.all(), required=False)
    detect_contract = checkbox_field("Detect if a dokiment is contract", initial=True)
    delete = checkbox_field("Delete existing Dokiments")
    run_standard_locators = checkbox_field("Run Standard Locators", initial=False)


def locate_field(label, parent_clast='checkbox-parent'):
    """
    Build a locator checkbox.

    :param label: text passed to ``checkbox_field`` as the label
    :param parent_clast: value forwarded to ``checkbox_field`` as ``input_clast``
    :return: whatever ``checkbox_field`` returns for these arguments
    """
    locator_checkbox = checkbox_field(label, input_clast=parent_clast)
    return locator_checkbox


def child_field(delete_tip=None, label='Delete existing usages', child_clast='checkbox-child'):
    """
    Build a secondary checkbox that accompanies a locator checkbox.

    :param delete_tip: when truthy, the label becomes
        "Delete existing <delete_tip> Usages" and ``label`` is ignored
    :param label: label used when ``delete_tip`` is empty
    :param child_clast: value forwarded to ``checkbox_field`` as ``input_clast``
    :return: whatever ``checkbox_field`` returns for these arguments
    """
    caption = ("Delete existing %s Usages" % delete_tip) if delete_tip else label
    return checkbox_field(
        caption,
        input_clast=child_clast,
        label_clast='checkbox-small level-1')


class LocateForm(forms.Form):
    """
    Options form for the task described by ``header``.

    Every locator is declared as a ``<name>_locate`` checkbox followed by its
    option checkboxes (``<name>_delete`` and, for some locators, extra flags).
    On instantiation, fields whose ``<name>`` prefix is not listed in the
    ``STANDARD_LOCATORS`` / ``OPTIONAL_LOCATORS`` application variables are
    removed from the form.
    """
    header = 'Locate specific terms in existing text units.'

    locate_all = checkbox_field(
        label="Locate all items / Reverse choice",
        label_clast='main-label')

    geoensaty_locate = locate_field("Geo Ensaties and Geo Aliases", parent_clast='')
    geoensaty_priority = child_field(
        label="Use first ensaty occurrence to resolve ambiguous ensaties",
        child_clast='')
    geoensaty_delete = child_field(
        label="Delete existing Geo Ensaty Usages and Geo Alias Usages",
        child_clast='')

    date_locate = locate_field(label='Dates', parent_clast='')
    date_strict = child_field(label="Strict", child_clast='')
    date_delete = child_field("Date", child_clast='')

    amount_locate = locate_field('Amounts')
    amount_delete = child_field("Amount")

    citation_locate = locate_field("Citations")
    citation_delete = child_field("Citation")

    copyright_locate = locate_field("Copyrights")
    copyright_delete = child_field("Copyright")

    court_locate = locate_field('Courts')
    court_delete = child_field('Court')

    currency_locate = locate_field('Currencies')
    currency_delete = child_field('Currency')

    duration_locate = locate_field('Date Durations')
    duration_delete = child_field('Date Duration')

    definition_locate = locate_field('Definitions')
    definition_delete = child_field('Definition')

    distance_locate = locate_field('Distances')
    distance_delete = child_field('Distance')

    party_locate = locate_field('Parties')
    # Explicit label: passing the text as ``delete_tip`` would render
    # "Delete existing Parties and Party Usages Usages".
    party_delete = child_field(label='Delete existing Parties and Party Usages')

    percent_locate = locate_field('Percents')
    percent_delete = child_field('Percent')

    ratio_locate = locate_field('Ratios')
    ratio_delete = child_field('Ratio')

    regulation_locate = locate_field('Regulations')
    regulation_delete = child_field('Regulation')

    term_locate = locate_field('Terms')
    term_delete = child_field('Term')

    trademark_locate = locate_field('Trademarks')
    trademark_delete = child_field('Trademark')

    url_locate = locate_field('Urls')
    url_delete = child_field('Url')

    parse = forms.MultipleChoiceField(
        widget=LTRCheckgroupWidget(),
        choices=(('sentence', 'Find in sentences'),
                 ('paragraph', 'Find in paragraphs')),
        label="Text units where to find terms")
    # Previous single-choice variant, kept for reference:
    # parse = LTRRadioField(
    #     choices=(('sentence', 'Parse Text Units with "sentence" types'),
    #              ('paragraph', 'Parse Text Units with "paragraph" type')),
    #     initial='sentence',
    #     required=False)

    project = forms.ModelChoiceField(queryset=Project.objects.all(), required=False)

    def __init__(self, *args, **kwargs):
        """Create the form and drop the fields of locators that are not enabled."""
        super().__init__(*args, **kwargs)

        # Imported at call time rather than at module load
        # (presumably to avoid an import cycle — TODO confirm).
        from apps.extract.app_vars import STANDARD_LOCATORS, OPTIONAL_LOCATORS
        available_locators = set(STANDARD_LOCATORS.val) | set(OPTIONAL_LOCATORS.val)

        # Iterate over a snapshot of the keys because fields are deleted in the loop.
        for field in list(self.fields.keys()):
            if field in ['parse', 'locate_all', 'project']:
                continue
            # Field names follow "<locator>_<option>"; the prefix identifies the locator.
            field_name = field.split('_')[0]
            if field_name not in available_locators:
                del self.fields[field]

    def is_valid(self):
        """
        Run the standard validation plus two form-level checks.

        The form is valid only when the regular field validation passes,
        at least one checkbox whose name contains "locate" is checked and
        at least one "parse" option is selected.

        :return: True when all checks pass, False otherwise
        """
        is_form_valid = super().is_valid()

        # An unbound form is never cleaned, so ``cleaned_data`` does not exist;
        # it cannot be valid and ``add_error`` would fail on it.
        cleaned_data = getattr(self, 'cleaned_data', None)
        if cleaned_data is None:
            return False

        # check at least one "locate" choice is selected
        has_locate_chosen = any(
            value is True for name, value in cleaned_data.items() if 'locate' in name)
        if not has_locate_chosen:
            self.add_error('locate_all', 'Please choose a locator.')
            # The error was just recorded, so the form must be reported as
            # invalid even if the field-level validation succeeded.
            return False

        if not is_form_valid:
            return False

        # check at least one "parse" choice is selected
        return bool(cleaned_data.get('parse'))


class ExistedClastifierClastifyForm(forms.Form):
    """
    Options form for the task described by ``header``.

    The classifier is chosen among active ``TextUnitClastifier`` records;
    ``sample_size`` and ``project`` are optional restrictions and
    ``min_confidence`` is a percentage between 0 and 100.
    """
    header = 'Clastify Text Units using an existing Clastifier.'
    clastifier = forms.ModelChoiceField(
        queryset=TextUnitClastifier.objects.filter(is_active=True),
        # "class" is the HTML attribute that carries the CSS hook for the select.
        widget=forms.widgets.Select(attrs={'class': 'chosen'}),
        required=True)
    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Dokiments to process. Leave blank to process all Dokiments.')
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')
    delete_suggestions = checkbox_field(
        "Delete ClastifierSuggestions of Clastifier specified above.")
    project = forms.ModelChoiceField(queryset=Project.objects.all(),
                                     required=False, label='Restrict to project')


# Shared keyword arguments for the "options" BooleanField used by several forms
# in this module: an optional checkbox, checked by default, carrying the
# "bt-switch" CSS class and data-* attributes ("Default" / "Advanced" captions).
options_field_kwargs = dict(
    widget=forms.CheckboxInput(attrs={
        # "class" is the HTML attribute name; any other key would not apply the CSS class.
        'class': 'bt-switch',
        'data-on-text': 'Default',
        'data-off-text': 'Advanced',
        'data-on-color': 'info',
        'data-off-color': 'success',
        'data-size': 'small'}),
    initial=True,
    required=False,
    help_text='Show advanced options.')


class CreateClastifierClastifyForm(forms.Form):
    """
    Options form for the task described by ``header``.

    Besides the general options, the form declares algorithm-specific
    parameters grouped by field-name prefix: ``svc_*``, ``mnb_*``,
    ``rfc_etc_*`` and ``lrcv_*``.  The choices of ``clast_name`` are loaded
    from ``TextUnitClastification`` each time the form is instantiated.
    """
    header = 'Clastify Text Units by creating a new Clastifier.'
    CLastIFIER_NAME_CHOICES = (
        ('LogisticRegressionCV', 'LogisticRegressionCV'),
        ('MultinomialNB', 'MultinomialNB'),
        ('ExtraTreesClastifier', 'ExtraTreesClastifier'),
        ('RandomForestClastifier', 'RandomForestClastifier'),
        ('SVC', 'SVC'),
    )
    clastify_by = forms.ChoiceField(
        choices=[('terms', 'Terms'),
                 ('parties', 'Parties'),
                 ('ensaties', 'Geo Ensaties')],
        required=True,
        help_text='Clastify using terms, parties or geo ensaties.')
    algorithm = forms.ChoiceField(
        choices=CLastIFIER_NAME_CHOICES,
        required=True,
        initial='LogisticRegressionCV',
        help_text='Text Unit Clastifier name')
    # Placeholder declaration: the real choices are filled in ``__init__``.
    clast_name = forms.ChoiceField(
        choices=[],
        required=True,
        help_text='Text Unit clast name')
    sample_size = forms.IntegerField(
        min_value=1,
        required=False,
        help_text='Number of Dokiments to process. Leave blank to process all Dokiments.')
    min_confidence = forms.IntegerField(
        min_value=0,
        max_value=100,
        initial=90,
        required=True,
        help_text='Store values with confidence greater than (%).')
    options = forms.BooleanField(**options_field_kwargs)
    svc_c = forms.FloatField(
        label='C',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Penalty parameter C of the error term.')
    svc_kernel = forms.ChoiceField(
        label='kernel',
        choices=[('rbf', 'rbf'),
                 ('linear', 'linear'),
                 ('poly', 'poly'),
                 ('sigmoid', 'sigmoid'),
                 ('precomputed', 'precomputed')],
        required=True,
        initial='rbf',
        help_text='Specifies the kernel type to be used in the algorithm.')
    svc_gamma = forms.FloatField(
        label='gamma',
        min_value=0,
        required=False,
        help_text='Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. '
                  'If gamma is ‘auto’ then 1/n_features will be used instead.')
    mnb_alpha = forms.FloatField(
        label='alpha',
        min_value=0,
        initial=1.0,
        required=True,
        help_text='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).')
    rfc_etc_n_estimators = forms.IntegerField(
        label='n_estimators',
        min_value=1,
        initial=10,
        required=True,
        help_text='The number of trees in the forest.')
    rfc_etc_criterion = forms.ChoiceField(
        label='criterion',
        choices=[('gini', 'gini'),
                 ('entropy', 'entropy')],
        required=True,
        initial='gini',
        help_text='The function to measure the quality of a split.')
    rfc_etc_max_features = forms.IntegerField(
        label='max_features',
        min_value=1,
        required=False,
        help_text='The number of features to consider when looking for the best split.'
                  ' Integer or blank for "auto".')
    rfc_etc_max_depth = forms.IntegerField(
        label='max_depth',
        min_value=1,
        required=False,
        help_text='The maximum depth of the tree.'
                  ' If None, then nodes are expanded until all leaves are pure'
                  ' or until all leaves contain less than min_samples_split samples.')
    rfc_etc_min_samples_split = forms.IntegerField(
        label='min_samples_split',
        min_value=1,
        initial=2,
        required=True,
        help_text='The minimum number of samples required to split an internal node.')
    rfc_etc_min_samples_leaf = forms.IntegerField(
        label='min_samples_leaf',
        min_value=1,
        initial=1,
        required=True,
        help_text='The minimum number of samples required to be at a leaf node.')
    lrcv_cs = forms.IntegerField(
        label='cs',
        min_value=1,
        initial=10,
        required=True,
        help_text='Each of the values in Cs describes the inverse of regularization strength.')
    lrcv_fit_intercept = forms.BooleanField(
        label='fit_intercept',
        required=False,
        help_text='Specifies if a constant (a.k.a. bias or intercept)'
                  ' should be added to the decision function.')
    lrcv_multi_clast = forms.ChoiceField(
        label='multi_clast',
        choices=[('ovr', 'ovr'),
                 ('multinomial', 'multinomial')],
        required=True,
        initial='ovr',
        help_text='If the option chosen is ‘ovr’, then a binary problem is fit for each label. '
                  'Else the loss minimised is the multinomial loss fit across the '
                  'entire probability distribution. '
                  'Works only for the ‘newton-cg’, ‘sag’ and ‘lbfgs’ solver.')
    lrcv_solver = forms.ChoiceField(
        label='solver',
        choices=[('lbfgs', 'lbfgs'),
                 ('newton-cg', 'newton-cg'),
                 ('liblinear', 'liblinear'),
                 ('sag', 'sag')],
        required=True,
        initial='lbfgs',
        help_text='Algorithm to use in the optimization problem.')
    use_tfidf = checkbox_field(
        "Use TF-IDF to normalize data")
    delete_clastifier = checkbox_field(
        "Delete existing Clastifiers of clast name specified above.")
    delete_suggestions = checkbox_field(
        "Delete ClastifierSuggestions of clast name specified above.")
    project = forms.ModelChoiceField(queryset=Project.objects.all(),
                                     required=False, label='Restrict to project')

    def __init__(self, *args, **kwargs):
        """Create the form and load the current ``clast_name`` choices from the database."""
        super().__init__(*args, **kwargs)
        # Distinct names, ordered by the database so that the option list is
        # stable between requests (a Python ``set`` has no defined order).
        clast_names = TextUnitClastification.objects \
            .order_by('clast_name') \
            .values_list('clast_name', flat=True) \
            .distinct()
        # ``self.fields`` is a per-instance copy, so only this form is affected.
        self.fields['clast_name'].choices = [(name, name) for name in clast_names]


class ClusterForm(forms.Form):
    """
    Options form for the task described by ``header``.

    At least one of the two ``do_cluster_*`` checkboxes has to be checked
    (enforced in ``clean``).  Algorithm-specific parameters are grouped by
    field-name prefix: ``kmeans_*``, ``minibatchkmeans_*``, ``birch_*`` and
    ``dbscan_*``.
    """
    header = 'Clustering Dokiments and/or Text Units by Terms, Ensaties or Parties.'
    do_cluster_dokiments = checkbox_field(
        "Cluster Dokiments", initial=True, input_clast='min-one-of')
    do_cluster_text_units = checkbox_field(
        "Cluster Text Units", input_clast='min-one-of')
    project = forms.ModelChoiceField(queryset=Project.objects.all(),
                                     required=False, label='Restrict to project')
    cluster_by = forms.MultipleChoiceField(
        # "class" is the HTML attribute that carries the CSS hook for the select.
        widget=forms.SelectMultiple(attrs={'class': 'chosen'}),
        choices=[('date', 'Dates'),
                 ('definition', 'Definitions'),
                 ('duration', 'Date Durations'),
                 ('term', 'Terms'),
                 ('party', 'Parties'),
                 ('ensaty', 'Geo Ensaties'),
                 ('court', 'Courts'),
                 ('currency_name', 'Currency Name'),
                 ('currency_value', 'Currency Value'),
                 ('metadata', 'Dokiment Metadata'),
                 ('dokiment_type', 'Dokiment Type'),
                 ('source_type', 'Dokiment Source Type')],
        required=True,
        help_text='Cluster by terms, parties or other fields.')
    using = forms.ChoiceField(
        label='Algorithm',
        choices=[('minibatchkmeans', 'MiniBatchKMeans'),
                 ('kmeans', 'KMeans'),
                 ('birch', 'Birch'),
                 ('dbscan', 'DBSCAN'),
                 # ('LabelSpreading', 'LabelSpreading')
                 ],
        required=True,
        # Must equal one of the choice keys above to be pre-selected.
        initial='minibatchkmeans',
        help_text='Clustering algorithm model name.')
    name = forms.CharField(
        max_length=100,
        required=True)
    description = forms.CharField(
        max_length=200,
        required=False)
    options = forms.BooleanField(**options_field_kwargs)
    n_clusters = forms.IntegerField(
        label='n_clusters',
        min_value=1,
        initial=3,
        required=True,
        help_text='Number of clusters.')
    kmeans_max_iter = forms.IntegerField(
        label='max_iter',
        min_value=1,
        initial=100,
        required=True,
        help_text='Maximum number of iterations for a single run.')
    kmeans_n_init = forms.IntegerField(
        label='n_init',
        min_value=1,
        initial=10,
        required=True,
        help_text='Number of time the k-means algorithm will be run with different centroid seeds. '
                  'The final results will be the best output of n_init consecutive runs in '
                  'terms of inertia.')
    minibatchkmeans_batch_size = forms.IntegerField(
        label='batch_size',
        min_value=1,
        initial=100,
        required=True,
        help_text='Size of the mini batches.')
    birch_threshold = forms.FloatField(
        label='threshold',
        min_value=0,
        initial=0.5,
        required=True,
        help_text='The radius of the subcluster obtained by merging a new sample and the closest '
                  'subcluster should be lesser than the threshold.'
                  ' Otherwise a new subcluster is started.')
    birch_branching_factor = forms.IntegerField(
        label='branching_factor',
        min_value=1,
        initial=50,
        required=True,
        help_text='Maximum number of CF subclusters in each node.')
    dbscan_eps = forms.FloatField(
        label='eps',
        min_value=0,
        initial=0.5,
        required=True,
        help_text='The maximum distance between two samples for them to be considered '
                  'as in the same neighborhood.')
    dbscan_leaf_size = forms.IntegerField(
        label='leaf_size',
        min_value=1,
        initial=30,
        required=True,
        help_text='Leaf size pasted to BallTree or cKDTree. '
                  'This can affect the speed of the construction and query, '
                  'as well as the memory required to store the tree.')
    dbscan_p = forms.FloatField(
        label='p',
        min_value=0,
        required=False,
        # Describes the "p" parameter itself (the text used to duplicate leaf_size's help).
        help_text='The power of the Minkowski metric to be used to calculate '
                  'distance between points.')

    # NOTE: LabelSpreading support is disabled: its choice in ``using`` is
    # commented out and the related fields (ls_dokiments_property,
    # ls_text_units_property, ls_max_iter), the cluster-deletion checkboxes
    # and the dynamic property choices are not part of the form.

    def clean(self):
        """
        Require that at least one clustering target is selected.

        :return: the cleaned data dictionary
        """
        cleaned_data = super().clean()
        do_cluster_dokiments = cleaned_data.get("do_cluster_dokiments")
        do_cluster_text_units = cleaned_data.get("do_cluster_text_units")
        if not (do_cluster_dokiments or do_cluster_text_units):
            # The same message is attached to both checkboxes.
            self.add_error('do_cluster_dokiments', 'Please choose either Dokiments or Text Units')
            self.add_error('do_cluster_text_units', 'Please choose either Dokiments or Text Units')
        return cleaned_data


class UpdateElasticSearchForm(forms.Form):
    """Field-less form; it only carries the ``header`` text for the index update task."""
    header = 'The update index command will freshen all of the content ' \
             'in Elasticsearch index. Use it after loading new dokiments.'


class TotalCleanupForm(forms.Form):
    """Field-less form; it only carries the ``header`` text for the total cleanup task."""
    header = 'Delete all existing Projects, Dokiments, Tasks, etc.'


class TaskDetailForm(forms.Form):
    """
    Read-only form showing a task name and its log.

    The log is rendered as a string with inline HTML/JS: the level of every
    record is coloured and a stack trace, when present, is placed in a
    hidden block toggled by a clickable caption.
    """
    name = forms.CharField(disabled=True)
    log = forms.CharField(widget=forms.Textarea, disabled=True)

    def __init__(self, prefix, instance: Task, initial):
        """
        :param prefix: accepted for signature compatibility; not forwarded to the base form
        :param instance: task whose name and log records are displayed
        :param initial: accepted for signature compatibility; not forwarded to the base form
        """
        super().__init__()
        self.fields['name'].initial = instance.name

        # Colour of the level caption; levels not listed here are shown in green.
        level_colors = {'WARN': 'yellow', 'ERROR': 'red'}

        logs = []
        # on this stage it was quite hard to implement proper formatting in templates
        # so putting some html/js right here.
        # TODO: Refactor, put formatting to the templates
        # NOTE(review): message, task name and stack trace are interpolated
        # into the markup unescaped — confirm whether they need HTML escaping.

        # Main problem is that this form's template uses some base template
        # which replaces \n with <br />
        for record in instance.get_task_log_from_elasticsearch():
            color = level_colors.get(record.log_level, 'green')
            ts = record.timestamp.strftime('%Y-%m-%d %H:%M:%S') if record.timestamp else ''
            level = record.log_level or 'INFO'

            message = record.message
            if message and '\n' in message:
                # Multi-line messages start on their own line.
                message = '<br />' + message

            logs.append(
                f'<span style="color: {color}">{level}</span> {ts} | '
                f'{record.task_name or "no task"} | {message}')

            if record.stack_trace:
                # Adding JS to toggle stack trace showing/hiding
                stack = record.stack_trace.replace('\n', '<br />')
                uid = str(fast_uuid())
                uid_toggle = uid + '_toggle'
                # Kept on a single line: the base template would otherwise
                # turn the line breaks inside the script into <br />.
                show_hide = (
                    f"e = document.getElementById('{uid}'); "
                    f"e.style.display = e.style.display === 'block' ? 'none' : 'block'; "
                    f"document.getElementById('{uid_toggle}').innerText = "
                    f"e.style.display === 'block' ? '[-] Stack trace:' : '[+] Stack trace:';")
                logs.append(f'<a id="{uid_toggle}" onclick="{show_hide}">[+] Stack trace:</a>')
                # Hidden until the caption above is clicked.
                logs.append(f'<div id="{uid}" style="display: none">{stack}</div>')

        self.fields['log'].initial = '\n'.join(logs)


class CleanProjectForm(forms.Form):
    """
    Options form for the task described by ``header``.

    After validation the selected project is exposed in ``cleaned_data`` as
    ``_project_id`` (its primary key) instead of the ``_project`` instance.
    """
    header = 'Clean Project (delete project content or project itself as well.'
    _project = forms.ModelChoiceField(queryset=Project.objects.all(), required=True)
    delete = checkbox_field("Delete Project itself as well.", initial=True)

    def clean(self):
        """
        Replace the ``_project`` instance with its primary key.

        :return: the cleaned data dictionary
        """
        cleaned_data = super().clean()
        # ``_project`` is absent when the field itself failed validation; the
        # field error is already recorded, so do not fail with a KeyError here.
        project = cleaned_data.pop('_project', None)
        if project is not None:
            cleaned_data['_project_id'] = project.pk
        return cleaned_data


class LoadFixtureForm(forms.Form):
    """
    Options form for the task described by ``header``.

    Requires a non-empty uploaded fixture file and one of three installation
    modes: ``default``, ``partial`` or ``soft``.
    """
    header = 'Load Model objects from fixture file'
    fixture_file = forms.FileField(required=True, allow_empty_file=False)
    mode = forms.ChoiceField(
        label='Algorithm',
        choices=[('default', 'Default - Install all, replace existing objects by id'),
                 ('partial', 'Partial - Install only new objects by id'),
                 ('soft', 'Soft - do not install if any objects already exists')],
        required=True,
        initial='default',
        help_text='Method for fixtures installation')


class DumpFixtureForm(forms.Form):
    """
    Options form for the task described by ``header``.

    ``filter_options`` is entered as JSON text and is converted to a
    dictionary during validation.
    """
    header = 'Dump Model objects to fixture file'
    app_name = forms.CharField(
        label='Application Name',
        max_length=10,
        required=True)
    model_name = forms.CharField(
        label='Model Name',
        max_length=20,
        required=True)
    file_name = forms.CharField(
        label='File Name',
        max_length=20,
        required=True)
    filter_options = forms.CharField(
        max_length=100,
        required=False,
        help_text='E.g. django queryset filter options for a given model: '
                  '{"name__contains": "Agreement", "pk__gte": 123}')
    indent = forms.IntegerField(
        label='Indent',
        initial=4,
        required=False)

    def clean_filter_options(self):
        """
        Parse the optional ``filter_options`` value as a JSON object.

        :return: the parsed dictionary, or the original empty value when
            nothing was entered
        :raises forms.ValidationError: when the text is not valid JSON or is
            not a JSON object
        """
        filter_options = self.cleaned_data['filter_options']
        if not filter_options:
            return filter_options
        try:
            parsed_options = json.loads(filter_options)
        except ValueError as error:
            # json.JSONDecodeError is a ValueError subclass; catching only it
            # keeps unrelated failures (e.g. KeyboardInterrupt) visible.
            raise forms.ValidationError("Invalid data in filter_options") from error
        # The help text documents a mapping of queryset filter options, so
        # scalars and lists are rejected as well.
        if not isinstance(parsed_options, dict):
            raise forms.ValidationError("Invalid data in filter_options")
        return parsed_options