哈希表 & List & Dictionary

MapleRiver

1.List

List可以认为是一个可以自动扩容的数组，每次扩容后容量翻倍

构造函数:

public class List<T> : IList<T>, System.Collections.IList, IReadOnlyList<T>
{
    private const int _defaultCapacity = 4;

    private T[] _items;
    private int _size;
    private int _version;
    private Object _syncRoot;

    static readonly T[]  _emptyArray = new T[0];        

    // Constructs a List. The list is initially empty and has a capacity
    // of zero. Upon adding the first element to the list the capacity is
    // increased to 16, and then increased in multiples of two as required.
    public List() {
        _items = _emptyArray;
    }

    // Constructs a List with a given initial capacity. The list is
    // initially empty, but will have room for the given number of elements
    // before any reallocations are required.
    // 
    public List(int capacity) {
        if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);
        Contract.EndContractBlock();

        if (capacity == 0)
            _items = _emptyArray;
        else
            _items = new T[capacity];
    }

    //...
    //其他内容
}

public class List<T> : IList<T>, System.Collections.IList, IReadOnlyList<T>

{

private const int _defaultCapacity = 4;

private T[] _items;

private int _size;

private int _version;

private Object _syncRoot;

static readonly T[] _emptyArray = new T[0];

// Constructs a List. The list is initially empty and has a capacity

// of zero. Upon adding the first element to the list the capacity is

// increased to 16, and then increased in multiples of two as required.

public List() {

_items = _emptyArray;

}

// Constructs a List with a given initial capacity. The list is

// initially empty, but will have room for the given number of elements

// before any reallocations are required.

public List(int capacity) {

if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity, ExceptionResource.ArgumentOutOfRange_NeedNonNegNum);

Contract.EndContractBlock();

if (capacity == 0)

_items = _emptyArray;

else

_items = new T[capacity];

}

//...

//其他内容

}

Add函数（自动扩容）：

// Adds the given object to the end of this list. The size of the list is
// increased by one. If required, the capacity of the list is doubled
// before adding the new element.
//
public void Add(T item) {
    if (_size == _items.Length) EnsureCapacity(_size + 1);
    _items[_size++] = item;
    _version++;
}

// Ensures that the capacity of this list is at least the given minimum
// value. If the currect capacity of the list is less than min, the
// capacity is increased to twice the current capacity or to min,
// whichever is larger.
private void EnsureCapacity(int min) {
    if (_items.Length < min) {
        int newCapacity = _items.Length == 0? _defaultCapacity : _items.Length * 2;
        // Allow the list to grow to maximum possible capacity (~2G elements) before encountering overflow.
        // Note that this check works even when _items.Length overflowed thanks to the (uint) cast
        if ((uint)newCapacity > Array.MaxArrayLength) newCapacity = Array.MaxArrayLength;
        if (newCapacity < min) newCapacity = min;
        Capacity = newCapacity;
    }
}

// Adds the given object to the end of this list. The size of the list is

// increased by one. If required, the capacity of the list is doubled

// before adding the new element.

public void Add(T item) {

if (_size == _items.Length) EnsureCapacity(_size + 1);

_items[_size++] = item;

_version++;

}

// Ensures that the capacity of this list is at least the given minimum

// value. If the currect capacity of the list is less than min, the

// capacity is increased to twice the current capacity or to min,

// whichever is larger.

private void EnsureCapacity(int min) {

if (_items.Length < min) {

int newCapacity = _items.Length == 0? _defaultCapacity : _items.Length * 2;

// Allow the list to grow to maximum possible capacity (~2G elements) before encountering overflow.

// Note that this check works even when _items.Length overflowed thanks to the (uint) cast

if ((uint)newCapacity > Array.MaxArrayLength) newCapacity = Array.MaxArrayLength;

if (newCapacity < min) newCapacity = min;

Capacity = newCapacity;

}

Remove和Inset函数并不是单纯的清理值，而是进行数组的copy：

// Removes the element at the given index. The size of the list is
// decreased by one.
// 
public bool Remove(T item) {
    int index = IndexOf(item);
    if (index >= 0) {
        RemoveAt(index);
        return true;
    }

    return false;
}

// Returns the index of the first occurrence of a given value in a range of
// this list. The list is searched forwards from beginning to end.
// The elements of the list are compared to the given value using the
// Object.Equals method.
// 
// This method uses the Array.IndexOf method to perform the
// search.
// 
public int IndexOf(T item) {
    Contract.Ensures(Contract.Result<int>() >= -1);
    Contract.Ensures(Contract.Result<int>() < Count);
    return Array.IndexOf(_items, item, 0, _size);
}

// Removes the element at the given index. The size of the list is
// decreased by one.
// 
public void RemoveAt(int index) {
    if ((uint)index >= (uint)_size) {
        ThrowHelper.ThrowArgumentOutOfRangeException();
    }
    Contract.EndContractBlock();
    _size--;
    if (index < _size) {
        Array.Copy(_items, index + 1, _items, index, _size - index);
    }
    _items[_size] = default(T);
    _version++;
}
// Inserts an element into this list at a given index. The size of the list
// is increased by one. If required, the capacity of the list is doubled
// before inserting the new element.
// 
public void Insert(int index, T item) {
    // Note that insertions at the end are legal.
    if ((uint) index > (uint)_size) {
        ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index, ExceptionResource.ArgumentOutOfRange_ListInsert);
    }
    Contract.EndContractBlock();
    if (_size == _items.Length) EnsureCapacity(_size + 1);
    if (index < _size) {
        Array.Copy(_items, index, _items, index + 1, _size - index);
    }
    _items[index] = item;
    _size++;            
    _version++;
}

// Removes the element at the given index. The size of the list is

// decreased by one.

public bool Remove(T item) {

int index = IndexOf(item);

if (index >= 0) {

RemoveAt(index);

return true;

}

return false;

}

// Returns the index of the first occurrence of a given value in a range of

// this list. The list is searched forwards from beginning to end.

// The elements of the list are compared to the given value using the

// Object.Equals method.

// This method uses the Array.IndexOf method to perform the

// search.

public int IndexOf(T item) {

Contract.Ensures(Contract.Result<int>() >= -1);

Contract.Ensures(Contract.Result<int>() < Count);

return Array.IndexOf(_items, item, 0, _size);

}

// Removes the element at the given index. The size of the list is

// decreased by one.

public void RemoveAt(int index) {

if ((uint)index >= (uint)_size) {

ThrowHelper.ThrowArgumentOutOfRangeException();

}

Contract.EndContractBlock();

_size--;

if (index < _size) {

Array.Copy(_items, index + 1, _items, index, _size - index);

}

_items[_size] = default(T);

_version++;

}

// Inserts an element into this list at a given index. The size of the list

// is increased by one. If required, the capacity of the list is doubled

// before inserting the new element.

public void Insert(int index, T item) {

// Note that insertions at the end are legal.

if ((uint) index > (uint)_size) {

ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.index, ExceptionResource.ArgumentOutOfRange_ListInsert);

}

Contract.EndContractBlock();

if (_size == _items.Length) EnsureCapacity(_size + 1);

if (index < _size) {

Array.Copy(_items, index, _items, index + 1, _size - index);

}

_items[index] = item;

_size++;

_version++;

}

2.哈希函数与哈希表

哈希函数：

将哈希表中元素的关键键值映射为元素存储位置的函数。

哈希表：

结构中存储了key的哈希值的数据结构。

一般的链表、树结构中存储位置是随机的，查找起来效率低，哈希表内存储了哈希值，可以快速计算要查找项的哈希值，并查找对应数据。

举个栗子：

我们现在需要存储一个键值对数据A-B，key为A，value为B。但是A和B所占用的内存较大，而且查找的时候比对也很困难。

这个时候我们可以使用哈希表来增加索引速度：将A通过哈希函数计算得到A的哈希值，然后将A的哈希值和A都存在哈希表内，这样在查找的时候就可以直接计算要查找目标的哈希值然后去哈希表内进行比对，就可以快速查找到对应数据。

哈希表的冲突：

key的哈希值是通过哈希函数计算得到的，那么就难以避免的会出现不同的key值通过哈希函数后得到相同的哈希值。

这种情况我们成为哈希冲突，解决哈希冲突的方法比较多，如拉链法，多哈希法，开放地址法等等。不过目前只有拉链法可以完全解决哈希冲突的问题。

拉链法：

主要思想就是在存储时使用一个额外的动态链表代替顺序存储结构。

以C#的Dictionary为例，拉链法的实现步骤为：

1.计算Key的HashValue

2.根据 HashValue 定位到 table[hashIndex]（table[hashIndex] 为一个链表的Node）

3.如果 table[hashIndex] 为空则直接插入，不然则添加到链表末尾

3.Dictionary

数据存储与构造函数：

       private struct Entry {
            public int hashCode;    // Lower 31 bits of hash code, -1 if unused
            public int next;        // Index of next entry, -1 if last
            public TKey key;           // Key of entry
            public TValue value;         // Value of entry
        }

        private int[] buckets;
        private Entry[] entries;

        public Dictionary(int capacity, IEqualityComparer<TKey> comparer) {
            if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity);
            if (capacity > 0) Initialize(capacity);
            this.comparer = comparer ?? EqualityComparer<TKey>.Default;
            //其他代码
        }

        private void Initialize(int capacity) {
            int size = HashHelpers.GetPrime(capacity);
            buckets = new int[size];
            for (int i = 0; i < buckets.Length; i++) buckets[i] = -1;
            entries = new Entry[size];
            freeList = -1;
        }

private struct Entry {

public int hashCode; // Lower 31 bits of hash code, -1 if unused

public int next; // Index of next entry, -1 if last

public TKey key; // Key of entry

public TValue value; // Value of entry

}

private int[] buckets;

private Entry[] entries;

public Dictionary(int capacity, IEqualityComparer<TKey> comparer) {

if (capacity < 0) ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.capacity);

if (capacity > 0) Initialize(capacity);

this.comparer = comparer ?? EqualityComparer<TKey>.Default;

//其他代码

}

private void Initialize(int capacity) {

int size = HashHelpers.GetPrime(capacity);

buckets = new int[size];

for (int i = 0; i < buckets.Length; i++) buckets[i] = -1;

entries = new Entry[size];

freeList = -1;

}

可以看到初始化的时候主要就是初始化了size、buckets和entries。

Entry：拉链表中动态链表的节点，也就是上面说的Node

buckets与entries：buckets存储哈希值对应到的Entry的索引，entries则为node的存储集合。

添加节点

         public void Add(TKey key, TValue value) {
            Insert(key, value, true);
        }

        private void Insert(TKey key, TValue value, bool add) {
            if( key == null ) {
                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
            }

            if (buckets == null) Initialize(0);
            int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;
            int targetBucket = hashCode % buckets.Length;

#if FEATURE_RANDOMIZED_STRING_HASHING
            int collisionCount = 0;
#endif

            for (int i = buckets[targetBucket]; i >= 0; i = entries[i].next) {
                if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) {
                    if (add) { 
                        ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate);
                    }
                    entries[i].value = value;
                    version++;
                    return;
                } 

#if FEATURE_RANDOMIZED_STRING_HASHING
                collisionCount++;
#endif
            }
            int index;
            if (freeCount > 0) {
                index = freeList;
                freeList = entries[index].next;
                freeCount--;
            }
            else {
                if (count == entries.Length)
                {
                    Resize();
                    targetBucket = hashCode % buckets.Length;
                }
                index = count;
                count++;
            }

            entries[index].hashCode = hashCode;
            entries[index].next = buckets[targetBucket];
            entries[index].key = key;
            entries[index].value = value;
            buckets[targetBucket] = index;
            version++;
            //一堆宏处理
        }

public void Add(TKey key, TValue value) {

Insert(key, value, true);

}

private void Insert(TKey key, TValue value, bool add) {

if( key == null ) {

ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);

}

if (buckets == null) Initialize(0);

int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;

int targetBucket = hashCode % buckets.Length;

#if FEATURE_RANDOMIZED_STRING_HASHING

int collisionCount = 0;

#endif

for (int i = buckets[targetBucket]; i >= 0; i = entries[i].next) {

if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) {

if (add) {

ThrowHelper.ThrowArgumentException(ExceptionResource.Argument_AddingDuplicate);

}

entries[i].value = value;

version++;

return;

}

#if FEATURE_RANDOMIZED_STRING_HASHING

collisionCount++;

#endif

}

int index;

if (freeCount > 0) {

index = freeList;

freeList = entries[index].next;

freeCount--;

}

else {

if (count == entries.Length)

{

Resize();

targetBucket = hashCode % buckets.Length;

}

index = count;

count++;

}

entries[index].hashCode = hashCode;

entries[index].next = buckets[targetBucket];

entries[index].key = key;

entries[index].value = value;

buckets[targetBucket] = index;

version++;

//一堆宏处理

}

可以看到：当来了一个key-value要存储时，先通过哈希函数计算key的哈希值，然后再计算targetBucket，即这个哈希值应该放到哪个buckets桶内（其实就是计算的哈希值与buckets长度取余），比如这里计算后的结果是1号桶。

通过buckets[1]可以拿到对应节点Entry的ID，然后entries[EntryID]就可以拿到动态链表的头节点。

最后添加节点到链表内：也就是构建一个新的Entry，然后这个Entry的next指向之前buckets[1]的EntryID，然后buckets[1]内缓存新构建的EntryID就可以了。

查找节点

其实不用看代码也能知道，我们把一个很长的线性表拆成了几份小的线性表，并通过哈希值进行快速查找，从而能更快的查找到结果。

        public bool ContainsKey(TKey key) {
            return FindEntry(key) >= 0;
        }
        private int FindEntry(TKey key) {
            if( key == null) {
                ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);
            }

            if (buckets != null) {
                int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;
                for (int i = buckets[hashCode % buckets.Length]; i >= 0; i = entries[i].next) {
                    if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) return i;
                }
            }
            return -1;
        }

public bool ContainsKey(TKey key) {

return FindEntry(key) >= 0;

}

private int FindEntry(TKey key) {

if( key == null) {

ThrowHelper.ThrowArgumentNullException(ExceptionArgument.key);

}

if (buckets != null) {

int hashCode = comparer.GetHashCode(key) & 0x7FFFFFFF;

for (int i = buckets[hashCode % buckets.Length]; i >= 0; i = entries[i].next) {

if (entries[i].hashCode == hashCode && comparer.Equals(entries[i].key, key)) return i;

}

return -1;

}

代码也比较简单，就是找到对应的动态链表头节点，然后依次比对哈希值和对应key值就知道是不是查找到了。

4.其他的哈希表优化

Dictionary中除了存储key值的哈希值，也缓存了要存储的key-value本身。但是有的时候为了优化资源减少内存，我们也可以使用哈希表进行内存优化。

例如：翻译表的格式为Key-Value键值对（中文-其他语言），可以计算key的哈希值，并在缓存数据时使用哈希值作为key进行存储，查找时也将要查找的key转换为哈希值后再进行匹配。

         //存值的时候优先存到哈希表内，如果哈希冲突了，放进string表
        public void Add(string key, string value)
        {
            uint hash = CalHashCode(key);
            if (!m_HashKey.ContainsKey(hash))
            {
                m_HashKey.Add(hash, value.Encode2Uint());
            }
            else
            {
                m_StringKey.Add(key, value);
            }
        }

        //取值的时候优先去string表中查找是否冲突过，如果没查到，说明可能没冲突，再去哈希表内找
        public bool TryGetValue(string key, out string value)
        {
            if (!m_StringKey.TryGetValue(key, out value))
            {
                // 如果m_StringKey不包含，则去m_HashKey中查找
                uint hash = CalHashCode(key);
                if (!m_HashKey.TryGetValue(hash, out value))
                {
                    return false;
                }
            }

            return true;
        }

//存值的时候优先存到哈希表内，如果哈希冲突了，放进string表

public void Add(string key, string value)

{

uint hash = CalHashCode(key);

if (!m_HashKey.ContainsKey(hash))

{

m_HashKey.Add(hash, value.Encode2Uint());

}

else

{

m_StringKey.Add(key, value);

}

//取值的时候优先去string表中查找是否冲突过，如果没查到，说明可能没冲突，再去哈希表内找

public bool TryGetValue(string key, out string value)

{

if (!m_StringKey.TryGetValue(key, out value))

{

// 如果m_StringKey不包含，则去m_HashKey中查找

uint hash = CalHashCode(key);

if (!m_HashKey.TryGetValue(hash, out value))

{

return false;

}

return true;

}

江枫

哈希表 & List & Dictionary

1.List

构造函数:

Add函数（自动扩容）：

Remove和Inset函数并不是单纯的清理值，而是进行数组的copy：

2.哈希函数与哈希表

哈希函数：

哈希表：

举个栗子：

哈希表的冲突：

拉链法：

3.Dictionary

数据存储与构造函数：

添加节点

查找节点

4.其他的哈希表优化

文章大纲

但行好事，莫问前程