import { compact, each, filter, flow, groupBy, includes, keys, last, some, sortBy, uniq, uniqueId } from 'lodash-es';

import Identifiers from '../../util/identifiers';

// Upper bound on how many leading rows of a column are examined when sniffing
// its content type (see Column#sampleRows, which slices the rows to this
// length before discarding empty values).
const ROW_SAMPLE_COUNT = 1000;

// The Column class represents the data included in a single column of the file
// uploaded by a user.
class Column {
  // Builds a column from its header text and the raw cell values found under
  // that header (one entry per row of the uploaded file). Cells may be empty,
  // null or undefined; non-identifier cells are otherwise treated as strings.
  constructor(header, rows) {
    // A unique ID is generated for each column so that we can reliably key them
    // when rendering React components – we can't use the header for this, as
    // users may include duplicate headers.
    this.id = uniqueId();

    this.header = header;
    this.rows = rows;

    // Immediately upon creation, we try to guess the type of this column based
    // on the headers and content.
    const { guessedFieldType, guessedFieldTypeConfidence } = this.guessedFieldType();

    // Each column stores the guessed column type and confidence in these
    // properties. However, we can't determine what type we have assigned to a
    // column purely on this basis – user selections and the detected type or
    // confidence of other columns can affect the type we select.
    this.interimGuessedFieldType = guessedFieldType;
    this.interimGuessedFieldTypeConfidence = guessedFieldTypeConfidence;

    // A null field type indicates that this column will not be imported. Every
    // column starts with this value, and the actual field type to use is
    // determined externally by the column builder.
    this.fieldType = null;
  }

  // guessedFieldType returns an object with two fields: guessedFieldType, and
  // guessedFieldTypeConfidence. If we cannot infer a field type from any
  // available information, then we will guess that the column should not be
  // imported (i.e. null field type, with an undefined confidence).
  guessedFieldType() {
    // Try to guess the field type first by looking at the header. If the header
    // matches an expected type, then there's a pretty good chance that this
    // column contains that kind of information, so we set the confidence to 1.0
    const headerGuess = this.guessTypeByHeader();
    if (headerGuess) {
      return {
        guessedFieldType: headerGuess,
        guessedFieldTypeConfidence: 1.0
      };
    }

    // Try to guess the field type by sniffing the content in the actual rows.
    const { contentSniffedGuess, contentSniffedGuessConfidence } = this.guessTypeByContentSniffing();
    if (contentSniffedGuess) {
      return {
        guessedFieldType: contentSniffedGuess,
        guessedFieldTypeConfidence: contentSniffedGuessConfidence
      };
    }

    // We could not infer a type from the headers or content, so we are going
    // to guess that this field should not be imported.
    return { guessedFieldType: null, guessedFieldTypeConfidence: undefined };
  }

  // Guess the field type by looking at the content of the header. This is done
  // with a series of regular expressions that look for particular keywords.
  // This could cause problems – may need to be revisited after further
  // internal testing.
  //
  // Returns the field type name, or null when no header pattern matches.
  guessTypeByHeader() {
    // Each entry pairs one of the headerLooksLike* getters with the field type
    // it implies. The list is checked in order and the first match wins, so
    // the ordering here is significant.
    const headerChecks = [
      ['headerLooksLikeAuthors', 'authors'],
      ['headerLooksLikeDepartments', 'departments'],
      ['headerLooksLikeDoi', 'doi'],
      ['headerLooksLikePubmed', 'pmid'],
      ['headerLooksLikeNctId', 'nct_id'],
      ['headerLooksLikeRepecId', 'repec_id'],
      ['headerLooksLikeArxivId', 'arxiv_id'],
      ['headerLooksLikeAdsBibcode', 'ads_bibcode'],
      ['headerLooksLikeHandle', 'handle'],
      ['headerLooksLikeUrn', 'urn'],
      ['headerLooksLikeUri', 'uri'],
      ['headerLooksLikeIsbn', 'isbn']
    ];

    const matched = headerChecks.find(([getterName]) => this[getterName]);
    return matched ? matched[1] : null;
  }

  get headerLooksLikeAuthors() {
    return /author|person|people/i.test(this.header);
  }

  get headerLooksLikeDepartments() {
    return /department|group|grant/i.test(this.header);
  }

  get headerLooksLikeDoi() {
    return /\bdoi\b/i.test(this.header);
  }

  get headerLooksLikePubmed() {
    return /\bpubmed\b|pmid/i.test(this.header);
  }

  get headerLooksLikeNctId() {
    return /clinical trial|nct id/i.test(this.header);
  }

  get headerLooksLikeRepecId() {
    return /repec/i.test(this.header);
  }

  get headerLooksLikeArxivId() {
    return /arxiv/i.test(this.header);
  }

  get headerLooksLikeAdsBibcode() {
    return /bibcode|\bads\b/i.test(this.header);
  }

  get headerLooksLikeHandle() {
    return /handle/i.test(this.header);
  }

  get headerLooksLikeUrn() {
    return /\burn\b/i.test(this.header);
  }

  get headerLooksLikeUri() {
    return /\bur(l|i)\b/i.test(this.header);
  }

  get headerLooksLikeIsbn() {
    return /isbn/i.test(this.header);
  }

  // Guess the field type by looking at the content of the rows. The basic
  // approach is:
  //
  // - Take a sample of the rows that contain any content
  // - Check each row to see what kind of identifier the Identifier library
  //   thinks it contains
  // - Return the guessed type based on this data
  //
  // Returns an object with a contentSniffedGuess field (null when no guess
  // can be made) and, when a guess is made, a contentSniffedGuessConfidence.
  guessTypeByContentSniffing() {
    // Use the Identifier library to determine the type of each sampled row,
    // discarding rows where nothing was recognised, and get the distinct types
    // that were found.
    const detectedTypes = compact(this.sampleRows.map((row) => Identifiers.typeOf(row)));
    const distinctTypes = uniq(detectedTypes);

    // Nothing was recognised at all, or so many different types were
    // recognised that any guess would be unreliable.
    if (distinctTypes.length === 0 || distinctTypes.length > 3) {
      return {
        contentSniffedGuess: null
      };
    }

    // If there is only a single distinct type, we can be fairly confident that
    // this column contains the detected type.
    if (distinctTypes.length === 1) {
      return {
        contentSniffedGuess: distinctTypes[0],
        contentSniffedGuessConfidence: 0.9
      };
    }

    // There were a limited number of detected types, so we select the one with
    // the highest plurality in the list. This allows us to avoid throwing out
    // a guess just because some rows were preferentially detected as the wrong
    // type (e.g. DOIs being extracted from URIs).
    const counts = {};
    detectedTypes.forEach((type) => {
      counts[type] = (counts[type] || 0) + 1;
    });

    // sortBy is ascending and stable, so the last entry has the highest count;
    // on a tie, the type that first appeared latest in the sample is chosen.
    const selectedType = last(sortBy(distinctTypes, (type) => counts[type]));

    // Include this as a relatively low confidence figure so that we can
    // override it if we get a better candidate elsewhere.
    return {
      contentSniffedGuess: selectedType,
      contentSniffedGuessConfidence: 0.5
    };
  }

  // Convenience method that returns a preview of this column for the given row
  // (for use in the UI)
  preview(row) {
    return this.rows[row];
  }

  // Returns a sample of the populated rows for this column: the first
  // ROW_SAMPLE_COUNT rows, with falsy (empty) values removed.
  get sampleRows() {
    return compact(this.rows.slice(0, ROW_SAMPLE_COUNT));
  }

  // Returns a truthy value if this column has a field type that indicates it
  // contains identifiers
  get isIdentifierColumn() {
    return this.fieldType && includes(Identifiers.TYPES, this.fieldType);
  }

  // Returns a truthy value if this column has a non-identifier field type
  get isInstitutionalColumn() {
    return this.fieldType && !this.isIdentifierColumn;
  }

  // Returns true if none of the rows in this column contain any content.
  get isEmpty() {
    return !some(this.rows, (r) => r && r.length);
  }

  // Returns the normalized data for the supplied row index, triggering a
  // normalization of the data if it has not yet been normalized, or if the
  // field type has changed since the data was last normalized.
  normalizedRow(index) {
    // The normalized form depends on the field type, so a cache built for a
    // different field type must not be reused.
    if (!this.normalizedRows || this.normalizedFieldType !== this.fieldType) {
      this.normalizeContent();
    }

    return this.normalizedRows[index];
  }

  // Normalize all of the data in this column according to the currently
  // specified field type. The result is stored in this.normalizedRows, and the
  // field type it was built for is recorded in this.normalizedFieldType.
  normalizeContent() {
    this.normalizedFieldType = this.fieldType;

    if (this.isIdentifierColumn) {
      // Identifier columns are run through the Identifiers library for the
      // selected field type, and the first recognised identifier returned
      this.normalizedRows = this.rows.map((row) => Identifiers.extract(this.fieldType, row)[0]);
    } else {
      // Non-identifier columns can contain multiple semicolon-delimited values,
      // so these are extracted individually at this stage.
      this.normalizedRows = this.rows.map((row) => {
        // Cells that are missing from the uploaded file arrive as null or
        // undefined; they simply contain no values.
        if (row == null) return [];

        return compact(
          String(row)
            .trim()
            .split(/;\s*/)
            .map((t) => t.trim())
        );
      });
    }
  }

  // Return a value indicating if this column potentially contains invalid data.
  // Columns that are not being imported (no field type) never warn.
  get warning() {
    if (!this.fieldType) return false;

    return this.isEmpty || Boolean(this.hasPossiblyInvalidType);
  }

  // Get a warning message about the content of this column, if one is
  // required. Returns undefined when there is nothing to warn about.
  get warningMessage() {
    if (!this.fieldType) return;

    if (this.isEmpty) {
      return I18n.t('dataset_upload.outputs_configuration.warnings.empty_column');
    }

    if (this.hasPossiblyInvalidType) {
      const detectedType = I18n.t(this.interimGuessedFieldType, {
        scope: 'dataset_upload.outputs_configuration.types'
      });
      const selectedType = I18n.t(this.fieldType, {
        scope: 'dataset_upload.outputs_configuration.types'
      });

      return I18n.t('dataset_upload.outputs_configuration.warnings.wrong_type', {
        detectedType,
        selectedType
      });
    }

    return;
  }

  // Returns a truthy value when a type was guessed for this column and it
  // differs from the field type that is currently assigned.
  get hasPossiblyInvalidType() {
    return this.interimGuessedFieldType && this.interimGuessedFieldType !== this.fieldType;
  }
}

// The ColumnBuilder consumes results loaded from a CSV file and returns the
// columns that it contains, with appropriate field types for each being
// automatically inferred.
class ColumnBuilder {
  // results is the list of row objects parsed from the CSV file, each keyed
  // by column header.
  constructor(results) {
    this.results = results;
  }

  // Builds a Column for every header found in the first result row, then
  // assigns a fieldType to those columns whose guessed type should be
  // imported. Columns are constructed afresh on every access.
  get columns() {
    // The headers are taken from the first row only; each header becomes a
    // Column holding that header's value from every row.
    const headerNames = keys(this.results[0] || {});
    const builtColumns = headerNames.map((name) => {
      const cellValues = this.results.map((record) => record[name]);
      return new Column(name, cellValues);
    });

    // Only columns with a guessed type take part in the selection below; they
    // are bucketed by that guessed type.
    const guessedColumns = filter(builtColumns, 'interimGuessedFieldType');
    const columnsByType = groupBy(guessedColumns, 'interimGuessedFieldType');

    for (const [guessedType, candidates] of Object.entries(columnsByType)) {
      if (includes(Identifiers.TYPES, guessedType)) {
        // Only one column per identifier type can be imported. The candidates
        // are ranked by ascending confidence and the final one wins; because
        // the sort is stable, a tie goes to the column that appears later.
        const ranked = sortBy(candidates, 'interimGuessedFieldTypeConfidence');
        last(ranked).fieldType = guessedType;
      } else {
        // Non-identifier types may be shared by any number of columns, so
        // every candidate receives the type.
        for (const candidate of candidates) {
          candidate.fieldType = guessedType;
        }
      }
    }

    return builtColumns;
  }
}

export { ColumnBuilder, Column };
