From 4613366036165900b24057916091cfbac6a53505 Mon Sep 17 00:00:00 2001 From: Janus Varmarken Date: Sat, 11 Aug 2018 13:34:52 -0700 Subject: [PATCH] Implement a generic version of the sequence alignment algorithm from Kleinberg's and Tardos' 'Algorithm Design' --- .../.idea/modules/SmartPlugDetector_main.iml | 1 + .../.idea/modules/SmartPlugDetector_test.iml | 5 +- .../seqalignment/AlignmentPricer.java | 70 +++++ .../SampleIntegerAlignmentPricer.java | 23 ++ .../seqalignment/SequenceAlignment.java | 75 +++++ .../test/SequenceAlignmentTest.java | 279 ++++++++++++++++++ 6 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/AlignmentPricer.java create mode 100644 Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SampleIntegerAlignmentPricer.java create mode 100644 Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SequenceAlignment.java create mode 100644 Code/Projects/SmartPlugDetector/src/test/java/edu/uci/iotproject/test/SequenceAlignmentTest.java diff --git a/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_main.iml b/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_main.iml index 0932ae0..5134c0a 100644 --- a/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_main.iml +++ b/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_main.iml @@ -5,6 +5,7 @@ + diff --git a/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_test.iml b/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_test.iml index f4752a0..b332c40 100644 --- a/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_test.iml +++ b/Code/Projects/SmartPlugDetector/.idea/modules/SmartPlugDetector_test.iml @@ -3,7 +3,10 @@ - + + + + diff --git 
a/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/AlignmentPricer.java b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/AlignmentPricer.java
new file mode 100644
index 0000000..0552279
--- /dev/null
+++ b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/AlignmentPricer.java
@@ -0,0 +1,70 @@
+package edu.uci.iotproject.comparison.seqalignment;
+
+import java.util.function.ToIntBiFunction;
+import java.util.function.ToIntFunction;
+
+/**
+ * Provides a generic implementation for the calculation of the cost of aligning two elements of a sequence as part of
+ * the sequence alignment algorithm (the algorithm is implemented in {@link SequenceAlignment}).
+ *
+ * @param <T> The type of the elements that are being aligned.
+ *
+ * @author Janus Varmarken {@literal }
+ * @author Rahmadi Trimananda {@literal }
+ */
+public class AlignmentPricer<T> {
+
+    /**
+     * A function that provides the cost of aligning a {@link T} with a gap.
+     */
+    private final ToIntFunction<T> mGapCostFunction;
+
+    /**
+     * A function that provides the cost of aligning a {@link T} with some other {@link T}.
+     */
+    private final ToIntBiFunction<T, T> mAlignmentCostFunction;
+
+    /**
+     * Constructs a new {@link AlignmentPricer}.
+     *
+     * @param alignmentCostFunction A function that specifies the cost of aligning a {@link T} with some other {@link T}
+     *                              (e.g., based on the values of the properties of the two instances).
+     * @param gapCostFunction A function that specifies the cost of aligning a {@link T} with a gap. Note that the
+     *                        function is free to specify different gap costs for different {@link T}s.
+     */
+    public AlignmentPricer(ToIntBiFunction<T, T> alignmentCostFunction, ToIntFunction<T> gapCostFunction) {
+        mAlignmentCostFunction = alignmentCostFunction;
+        mGapCostFunction = gapCostFunction;
+    }
+
+    /**
+     * Calculate the cost of aligning {@code item1} with {@code item2}. If either of the two arguments is set to
+     * {@code null}, the cost of aligning the other argument with a gap will be returned. Note that both arguments
+     * cannot be {@code null} at the same time as that translates to aligning a gap with a gap, which is pointless.
+     *
+     * @param item1 The first of the two aligned objects. Set to {@code null} to calculate the cost of aligning
+     *              {@code item2} with a gap.
+     * @param item2 The second of the two aligned objects. Set to {@code null} to calculate the cost of aligning
+     *              {@code item1} with a gap.
+     * @return The cost of aligning {@code item1} with {@code item2}.
+     */
+    public int alignmentCost(T item1, T item2) {
+        // If both arguments are null, the caller is aligning a gap with a gap, which is pointless (might as well remove
+        // both gaps in that case!)
+        if (item1 == null && item2 == null) {
+            throw new IllegalArgumentException("Both arguments cannot be null: you are aligning a gap with a gap!");
+        }
+        // If one item is null, it means we're aligning the other item with a gap.
+        // Invoke the provided gap cost function to get the gap cost.
+        if (item1 == null) {
+            return mGapCostFunction.applyAsInt(item2);
+        }
+        if (item2 == null) {
+            return mGapCostFunction.applyAsInt(item1);
+        }
+        // If both arguments are present, we simply delegate the task of calculating the cost of aligning the two items
+        // to the provided alignment cost function.
+        return mAlignmentCostFunction.applyAsInt(item1, item2);
+    }
+
+}
diff --git a/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SampleIntegerAlignmentPricer.java b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SampleIntegerAlignmentPricer.java
new file mode 100644
index 0000000..a09a10d
--- /dev/null
+++ b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SampleIntegerAlignmentPricer.java
@@ -0,0 +1,23 @@
+package edu.uci.iotproject.comparison.seqalignment;
+
+/**
+ * A sample {@link AlignmentPricer} for computing the cost of aligning integer values. In this sample implementation,
+ * the cost of aligning two integers {@code i1} and {@code i2} is {@code Math.abs(i1 - i2)}, i.e., it is the absolute
+ * value of the difference between {@code i1} and {@code i2}. The cost of aligning an integer {@code i} with a gap is
+ * {@code Math.abs(i)}, i.e., the gap is essentially treated as a zero (this keeps costs non-negative).
+ *
+ * @author Janus Varmarken {@literal }
+ * @author Rahmadi Trimananda {@literal }
+ */
+public class SampleIntegerAlignmentPricer extends AlignmentPricer<Integer> {
+
+    /**
+     * Constructs a new {@link SampleIntegerAlignmentPricer}.
+     */
+    public SampleIntegerAlignmentPricer() {
+        // Cost of aligning integers i1 and i2 is the absolute value of their difference.
+        // Cost of aligning integer i with a gap is |i| (as if it was aligned with 0).
+        super((i1, i2) -> Math.abs(i1 - i2), i -> Math.abs(i));
+    }
+
+}
diff --git a/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SequenceAlignment.java b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SequenceAlignment.java
new file mode 100644
index 0000000..005d7ff
--- /dev/null
+++ b/Code/Projects/SmartPlugDetector/src/main/java/edu/uci/iotproject/comparison/seqalignment/SequenceAlignment.java
@@ -0,0 +1,75 @@
+package edu.uci.iotproject.comparison.seqalignment;
+
+/**
+ * A generic implementation of the sequence alignment algorithm given in Kleinberg's and Tardos' "Algorithm Design".
+ * This implementation is the basic version. There is a more complex version which significantly reduces the space
+ * complexity at a slight cost to time complexity.
+ *
+ * @param <ALIGNMENT_UNIT> The unit of the alignment, or, in other words, the granularity of the
+ *                         alignment. For example, for 'classical' string alignment (as in sequence alignment where we
+ *                         try to align two strings character by character -- the example most often used in books on
+ *                         algorithms) this would be a {@link Character}. As a second example, by specifying
+ *                         {@link String}, one can decrease the granularity so as to align blocks of characters
+ *                         (e.g., if one wants to align two string arrays).
+ *
+ * @author Janus Varmarken {@literal }
+ * @author Rahmadi Trimananda {@literal }
+ */
+public class SequenceAlignment<ALIGNMENT_UNIT> {
+
+
+    /**
+     * Provides the cost of aligning two {@link ALIGNMENT_UNIT}s with one another as well as the cost of aligning an
+     * {@link ALIGNMENT_UNIT} with a gap.
+     */
+    private final AlignmentPricer<ALIGNMENT_UNIT> mAlignmentPricer;
+
+    /**
+     * Constructs a new {@link SequenceAlignment}. The new instance relies on the provided {@code alignmentPricer} to
+     * provide the cost of aligning two {@link ALIGNMENT_UNIT}s as well as the cost of aligning an
+     * {@link ALIGNMENT_UNIT} with a gap.
+     *
+     * @param alignmentPricer An {@link AlignmentPricer} that provides the cost of aligning two {@link ALIGNMENT_UNIT}s
+     *                        with one another as well as the cost of aligning an {@link ALIGNMENT_UNIT} with a gap.
+     */
+    public SequenceAlignment(AlignmentPricer<ALIGNMENT_UNIT> alignmentPricer) {
+        mAlignmentPricer = alignmentPricer;
+    }
+
+
+    /**
+     * Calculates the cost of the cheapest alignment of {@code sequence1} with {@code sequence2}.
+     *
+     * @param sequence1 A sequence that is to be aligned with {@code sequence2}.
+     * @param sequence2 A sequence that is to be aligned with {@code sequence1}.
+     *
+     * @return The cost of aligning {@code sequence1} with {@code sequence2}.
+     */
+    public int calculateAlignment(ALIGNMENT_UNIT[] sequence1, ALIGNMENT_UNIT[] sequence2) {
+        int[][] costs = new int[sequence1.length + 1][sequence2.length + 1];
+        /*
+         * Base cases: aligning a prefix of one sequence with the empty sequence means every item in the prefix is
+         * matched with a gap. Kleinberg & Tardos use a uniform gap penalty (i * delta); here gap costs may *differ*
+         * per item (e.g., matching a 'c' with a gap may be more expensive than matching a 'b' with a gap), so the
+         * base case is the running sum of the individual gap costs. For a uniform gap cost the two coincide.
+         */
+        for (int i = 1; i <= sequence1.length; i++) {
+            costs[i][0] = mAlignmentPricer.alignmentCost(sequence1[i-1], null) + costs[i-1][0];
+        }
+        for (int j = 1; j <= sequence2.length; j++) {
+            costs[0][j] = mAlignmentPricer.alignmentCost(sequence2[j-1], null) + costs[0][j-1];
+        }
+        for (int j = 1; j <= sequence2.length; j++) {
+            for (int i = 1; i <= sequence1.length; i++) {
+                // The cost when current items of both sequences are aligned (sequence1's item is passed first).
+                int costAligned = mAlignmentPricer.alignmentCost(sequence1[i-1], sequence2[j-1]) + costs[i-1][j-1];
+                // The cost when current item from sequence1 is not aligned (it's matched with a gap)
+                int seq1ItemNotMatched = mAlignmentPricer.alignmentCost(sequence1[i-1], null) + costs[i-1][j];
+                // The cost when current item from sequence2 is not aligned (it's matched with a gap)
+                int seq2ItemNotMatched = mAlignmentPricer.alignmentCost(sequence2[j-1], null) + costs[i][j-1];
+                costs[i][j] = Math.min(costAligned, Math.min(seq1ItemNotMatched, seq2ItemNotMatched));
+            }
+        }
+        return costs[sequence1.length][sequence2.length];
+    }
+}
diff --git a/Code/Projects/SmartPlugDetector/src/test/java/edu/uci/iotproject/test/SequenceAlignmentTest.java b/Code/Projects/SmartPlugDetector/src/test/java/edu/uci/iotproject/test/SequenceAlignmentTest.java
new file mode 100644
index 0000000..90e8eab
--- /dev/null
+++ b/Code/Projects/SmartPlugDetector/src/test/java/edu/uci/iotproject/test/SequenceAlignmentTest.java
@@ -0,0 +1,279 @@
+package edu.uci.iotproject.test;
+
+import edu.uci.iotproject.comparison.seqalignment.AlignmentPricer;
+import edu.uci.iotproject.comparison.seqalignment.SequenceAlignment;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.function.ToIntBiFunction;
+import java.util.function.ToIntFunction;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests the implementation of {@link SequenceAlignment}.
+ *
+ * @author Janus Varmarken {@literal }
+ * @author Rahmadi Trimananda {@literal }
+ */
+public class SequenceAlignmentTest {
+
+    private char[] lowercaseVowels;
+    private char[] lowercaseConsonants;
+
+    private Character[] meanChars;
+    private Character[] nameChars;
+
+    /**
+     * Cost function for the alignment of letters in the example execution of the sequence alignment algorithm in
+     * Kleinberg's and Tardos' "Algorithm Design", where 'mean' and 'name' are aligned.
+     */
+    private ToIntBiFunction<Character, Character> kleinbergExampleAlignmentCostFunc;
+
+    /**
+     * Cost function for the alignment of letters with gaps in the example execution of the sequence alignment algorithm
+     * in Kleinberg's and Tardos' "Algorithm Design", where 'mean' and 'name' are aligned. Gap cost is set to 2,
+     * regardless of input character.
+     */
+    private ToIntFunction<Character> kleinbergExampleGapCostFunc;
+
+    /**
+     * Calculates the cost of aligning a letter with another letter or a letter with a gap according to the cost recipe
+     * used in the example in Kleinberg & Tardos.
+     */
+    private AlignmentPricer<Character> kleinbergAlignmentPricer;
+
+    /**
+     * Executes the sequence alignment algorithm using the cost function defined in the example in Kleinberg & Tardos,
+     * i.e., {@link #kleinbergAlignmentPricer}.
+     */
+    private SequenceAlignment<Character> kleinbergSequenceAligner;
+
+    @Before
+    public void initialize() {
+        // We consider 'y' a vowel for the sake of simplicity.
+        // Note: we assume an all lowercase string!
+        lowercaseVowels = new char[] { 'a', 'e', 'i', 'o', 'u', 'y' };
+        lowercaseConsonants = new char[] { 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's',
+                't', 'v', 'w', 'x', 'z' };
+        kleinbergExampleAlignmentCostFunc = (c1, c2) -> {
+            // Unbox to primitive type for the sake of brevity in the statements to follow.
+            final char char1 = c1.charValue();
+            final char char2 = c2.charValue();
+
+            // If char1 and char2 are the same characters, the cost of aligning them is 0.
+            if (char1 == char2) return 0;
+
+            final boolean char1IsVowel = isVowel(char1);
+            final boolean char1IsConsonant = isConsonant(char1);
+            final boolean char2IsVowel = isVowel(char2);
+            final boolean char2IsConsonant = isConsonant(char2);
+
+            // Alignment cost is undefined for non alphabet characters.
+            if (!char1IsVowel && !char1IsConsonant) fail("not an alphabet letter: " + char1);
+            if (!char2IsVowel && !char2IsConsonant) fail("not an alphabet letter: " + char2);
+
+            // If char1 and char2 are both vowels or both consonants, the cost is 1.
+            if (char1IsVowel && char2IsVowel || char1IsConsonant && char2IsConsonant) return 1;
+
+            // If one of char1 and char2 is a consonant, while the other is a vowel, the cost is 3.
+            return 3;
+        };
+        // The cost of a gap is 2, regardless of what letter is aligned with the gap.
+        kleinbergExampleGapCostFunc = c -> 2;
+
+        // char[] -> Character[] conversion courtesy of https://stackoverflow.com/a/27690990/1214974
+        meanChars = "mean".chars().mapToObj(c -> (char)c).toArray(Character[]::new);
+        nameChars = "name".chars().mapToObj(c -> (char)c).toArray(Character[]::new);
+
+        kleinbergAlignmentPricer = new AlignmentPricer<>(kleinbergExampleAlignmentCostFunc,
+                kleinbergExampleGapCostFunc);
+
+        kleinbergSequenceAligner = new SequenceAlignment<>(kleinbergAlignmentPricer);
+    }
+
+    @Test
+    public void kleinbergExampleOptAlignmentCostShouldBe6() {
+        // Cost of the optimal alignment of the two words
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(meanChars, nameChars);
+        final int expectedAlignmentCost = 6;
+        String msg = String.format("Kleinberg example: computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+
+    @Test
+    public void meanAlignedWithEmptyStringShouldBe8() {
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(meanChars, new Character[0]);
+        // 'mean' aligned with the empty string equals paying four gap costs, so total cost is: 4 * 2 = 8.
+        final int expectedAlignmentCost = 8;
+        String msg = String.format("'mean' aligned with empty string: computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void mAlignedWithNameShouldBe6() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         * n a m e
+         * _ _ m _
+         * This should have a cost of 3 * gapCost = 6
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'm' }, nameChars);
+        final int expectedAlignmentCost = 6;
+        String msg = String.format("'m' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void meAlignedWithNameShouldBe4() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         * n a m e
+         * _ _ m e
+         * This should have a cost of 2 * gapCost = 4
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'm', 'e' }, nameChars);
+        final int expectedAlignmentCost = 4;
+        String msg = String.format("'me' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+        // Check that order of arguments doesn't matter
+        final int optAlignmentCostReversed = kleinbergSequenceAligner.calculateAlignment(nameChars, new Character[] { 'm', 'e' });
+        msg = "'me' aligned with 'name': different order of arguments unexpectedly produced different result";
+        assertTrue(msg, optAlignmentCostReversed == optAlignmentCost && optAlignmentCostReversed == expectedAlignmentCost);
+    }
+
+    @Test
+    public void ameAlignedWithNameShouldBe2() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         * n a m e
+         * _ a m e
+         * This should have a cost of 1 * gapCost = 2
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'a', 'm', 'e' }, nameChars);
+        final int expectedAlignmentCost = 2;
+        String msg = String.format("'ame' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void fameAlignedWithNameShouldBe1() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         * n a m e
+         * f a m e
+         * This should have a cost of 1 * consonantMatchedWithConsonantCost = 1
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'f', 'a', 'm', 'e' },
+                nameChars);
+        final int expectedAlignmentCost = 1;
+        String msg = String.format("'fame' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void nameAlignedWithNameShouldBe0() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         * n a m e
+         * n a m e
+         * This should have a cost of 0.
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'n', 'a', 'm', 'e' },
+                nameChars);
+        final int expectedAlignmentCost = 0;
+        String msg = String.format("'name' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void emanAlignedWithNameShouldBe6() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         *
+         * _ n a m e
+         * e m a n _
+         *
+         * or
+         *
+         * n a m e _
+         * _ e m a n
+         *
+         * This should have a cost of 2 * gapCost + 2 * consonantMatchedWithConsonantCost = 2 * 2 + 2 * 1 = 6.
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'e', 'm', 'a', 'n' },
+                nameChars);
+        final int expectedAlignmentCost = 6;
+        String msg = String.format("'eman' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+    @Test
+    public void naemAlignedWithNameShouldBe4() {
+        /*
+         * Note: this also uses the cost function specified in Kleinberg & Tardos.
+         * Best alignment should be:
+         *
+         * n a _ m e
+         * n a e m _
+         *
+         * or
+         *
+         * n a m e _
+         * n a _ e m
+         *
+         * This should have a cost of 2 * gapCost = 4.
+         */
+        final int optAlignmentCost = kleinbergSequenceAligner.calculateAlignment(new Character[] { 'n', 'a', 'e', 'm' },
+                nameChars);
+        final int expectedAlignmentCost = 4;
+        String msg = String.format("'naem' aligned with 'name': computed opt != expected opt (computed=%d expected=%d)",
+                optAlignmentCost, expectedAlignmentCost);
+        assertTrue(msg, optAlignmentCost == expectedAlignmentCost);
+    }
+
+
+    /**
+     * Checks if {@code letter} is a lowercase vowel. Note: for simplicity, 'y' is considered a vowel.
+     * @param letter A {@code char} expected to be a vowel.
+     * @return {@code true} if {@code letter} is a vowel, {@code false} otherwise.
+     */
+    private boolean isVowel(char letter) {
+        for (char vowel : lowercaseVowels) {
+            if (letter == vowel) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Checks if {@code letter} is a lowercase consonant. Note: for simplicity, 'y' is considered a vowel.
+     * @param letter A {@code char} expected to be a consonant.
+     * @return {@code true} if {@code letter} is a consonant, {@code false} otherwise.
+     */
+    private boolean isConsonant(char letter) {
+        for (char consonant : lowercaseConsonants) {
+            if (letter == consonant) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
-- 
2.34.1